From feead09d8259f9e05bf6276fee51c913d4be95b9 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Thu, 11 Sep 2025 17:49:43 +0200 Subject: [PATCH 001/152] WCNF parser --- cpmpy/tools/wcnf/__init__.py | 90 ++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 cpmpy/tools/wcnf/__init__.py diff --git a/cpmpy/tools/wcnf/__init__.py b/cpmpy/tools/wcnf/__init__.py new file mode 100644 index 000000000..3446f0906 --- /dev/null +++ b/cpmpy/tools/wcnf/__init__.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## __init__.py +## +""" +Set of utilities for working with WCNF-formatted CP models. + + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read_wcnf +""" + + +import os +import lzma +import cpmpy as cp +from io import StringIO +from typing import Union + + +def _get_var(i, vars_dict): + """ + Returns CPMpy boolean decision variable matching to index `i` if exists, else creates a new decision variable. + + Arguments: + i: index + vars_dict (dict): dictionary to keep track of previously generated decision variables + """ + if i not in vars_dict: + vars_dict[i] = cp.boolvar(name=f"x{i}") # <- be carefull that name doesn't clash with generated variables during transformations / user variables + return vars_dict[i] + + +def read_wcnf(wcnf: Union[str, os.PathLike]) -> cp.Model: + """ + Parser for WCNF format. Reads in an instance and returns its matching CPMpy model. + + Arguments: + wcnf (str or os.PathLike): A string containing a WCNF-formatted model, or a path to a file containing containing the same. + + Returns: + cp.Model: The CPMpy model of the WCNF instance. 
+ """ + # If wcnf is a path to a file -> open file + if isinstance(wcnf, (str, os.PathLike)) and os.path.exists(wcnf): + f_open = lzma.open if str(wcnf).endswith(".xz") else open + f = f_open(wcnf, "rt") + # If wcnf is a string containing a model -> create a memory-mapped file + else: + f = StringIO(wcnf) + + model = cp.Model() + vars = {} + soft_terms = [] + + for raw in f: + line = raw.strip() + + # Empty line or a comment -> skip + if not line or line.startswith("c"): + continue + + # Hard clause + if line[0] == "h": + literals = map(int, line[1:].split()) + clause = [_get_var(i, vars) if i > 0 else ~_get_var(-i, vars) + for i in literals if i != 0] + model.add(cp.any(clause)) + + # Soft clause (weight first) + else: + parts = line.split() + weight = int(parts[0]) + literals = map(int, parts[1:]) + clause = [_get_var(i, vars) if i > 0 else ~_get_var(-i, vars) + for i in literals if i != 0] + soft_terms.append(weight * cp.any(clause)) + + # Objective = sum of soft clause terms + if soft_terms: + model.maximize(sum(soft_terms)) + + return model \ No newline at end of file From 5ade48ec7a661123688a79979d262e591c44e21c Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Thu, 11 Sep 2025 18:02:32 +0200 Subject: [PATCH 002/152] Small docstring change --- cpmpy/tools/wcnf/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/wcnf/__init__.py b/cpmpy/tools/wcnf/__init__.py index 3446f0906..5ce83a146 100644 --- a/cpmpy/tools/wcnf/__init__.py +++ b/cpmpy/tools/wcnf/__init__.py @@ -43,7 +43,9 @@ def read_wcnf(wcnf: Union[str, os.PathLike]) -> cp.Model: Parser for WCNF format. Reads in an instance and returns its matching CPMpy model. Arguments: - wcnf (str or os.PathLike): A string containing a WCNF-formatted model, or a path to a file containing containing the same. 
+ wcnf (str or os.PathLike): + - A file path to an WCNF file (optionally LZMA-compressed with `.xz`) + - OR a string containing the WCNF content directly Returns: cp.Model: The CPMpy model of the WCNF instance. From 7f52f5fc7694d6736c877889835ca4266db95109 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Thu, 11 Sep 2025 18:02:54 +0200 Subject: [PATCH 003/152] OPB parser --- cpmpy/tools/opb/__init__.py | 179 ++++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 cpmpy/tools/opb/__init__.py diff --git a/cpmpy/tools/opb/__init__.py b/cpmpy/tools/opb/__init__.py new file mode 100644 index 000000000..963c021cb --- /dev/null +++ b/cpmpy/tools/opb/__init__.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## __init__.py +## +""" +Set of utilities for working with OPB-formatted CP models. + +Currently only the restricted OPB PB24 format is supported (without WBO). + + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read_opb +""" + + +import os +import re +import lzma +import cpmpy as cp +from io import StringIO +from typing import Union +from functools import reduce +from operator import mul + +# Regular expressions +HEADER_RE = re.compile(r'(.*)\s*#variable=\s*(\d+)\s*#constraint=\s*(\d+).*') +TERM_RE = re.compile(r"([+-]?\d+)((?:\s+~?x\d+)+)") +OBJ_TERM_RE = re.compile(r'^min:') +IND_TERM_RE = re.compile(r'([>=|<=|=]+)\s+([+-]?\d+)') +IND_TERM_RE = re.compile(r'(>=|<=|=)\s*([+-]?\d+)') + + +def _parse_term(line, vars): + """ + Parse a line containing OPB terms into a CPMpy expression. + + Supports: + - Linear terms (e.g., +2 x1) + - Non-linear terms (e.g., -1 x1 x14) + - Negated variables using '~' (e.g., ~x5) + + Arguments: + line (str): A string containing one or more terms. + vars (list[cp.boolvar]): List or array of CPMpy Boolean variables. + + Returns: + cp.Expression: A CPMpy expression representing the sum of all parsed terms. 
+ + Example: + >>> _parse_term("2 x2 x3 +3 x4 ~x5", vars) + sum([2, 3] * [(IV2*IV3), (IV4*~IV5)]) + """ + + terms = [] + for w, vars_str in TERM_RE.findall(line): + factors = [] + + for v in vars_str.split(): + if v.startswith("~x"): + idx = int(v[2:]) # remove "~x" + factors.append(~vars[idx]) + else: + idx = int(v[1:]) # remove "x" + factors.append(vars[idx]) + + term = int(w) * reduce(mul, factors, 1) # create weighted term + terms.append(term) + + return cp.sum(terms) + +def _parse_constraint(line, vars): + """ + Parse a single OPB constraint line into a CPMpy comparison expression. + + Arguments: + line (str): A string representing a single OPB constraint. + vars (list[cp.boolvar]): List or array of CPMpy Boolean variables. Will be index to get the variables for the constraint. + + Returns: + cp.expressions.core.Comparison: A CPMpy comparison expression representing + the constraint. + + Example: + >>> _parse_constraint("-1 x1 x14 -1 x1 ~x17 >= -1", vars) + sum([-1, -1] * [(IV1*IV14), (IV1*~IV17)]) >= -1 + """ + + op, ind_term = IND_TERM_RE.search(line).groups() + lhs = _parse_term(line, vars) + + rhs = int(ind_term) if ind_term.lstrip("+-").isdigit() else vars[int(ind_term)] + + return cp.expressions.core.Comparison( + name="==" if op == "=" else ">=", + left=lhs, + right=rhs + ) + +def read_opb(opb: Union[str, os.PathLike]) -> cp.Model: + """ + Parser for OPB (Pseudo-Boolean) format. Reads in an instance and returns its matching CPMpy model. 
+ + Based on PyPBLib's example parser: https://hardlog.udl.cat/static/doc/pypblib/html/library/index.html#example-from-opb-to-cnf-file + + Supports: + - Linear and non-linear terms (e.g., -1 x1 x14 +2 x2) + - Negated variables using '~' (e.g., ~x5) + - Minimisation objective + - Comparison operators in constraints: '=', '>=' + + Arguments: + opb (str or os.PathLike): + - A file path to an OPB file (optionally LZMA-compressed with `.xz`) + - OR a string containing the OPB content directly + + Returns: + cp.Model: The CPMpy model of the OPB instance. + + Example: + >>> opb_text = ''' + ... * #variable= 5 #constraint= 2 #equal= 1 intsize= 64 #product= 5 sizeproduct= 13 + ... min: 2 x2 x3 +3 x4 ~x5 +2 ~x1 x2 +3 ~x1 x2 x3 ~x4 ~x5 ; + ... 2 x2 x3 -1 x1 ~x3 = 5 ; + ... ''' + >>> model = read_opb(opb_text) + >>> print(model) + Model(...) + + Notes: + - Comment lines starting with '*' are ignored. + - Only "min:" objectives are supported; "max:" is not recognized. + """ + + + # If opb is a path to a file -> open file + if isinstance(opb, (str, os.PathLike)) and os.path.exists(opb): + f_open = lzma.open if str(opb).endswith(".xz") else open + f = f_open(opb, 'rt') + # If opb is a string containing a model -> create a memory-mapped file + else: + f = StringIO(opb) + + # Look for header on first line + line = f.readline() + header = HEADER_RE.match(line) + if not header: # If not found on first line, look on second (happens when passing multi line string) + _line = f.readline() + header = HEADER_RE.match(_line) + if not header: + raise ValueError(f"Missing or incorrect header: \n0: {line}1: {_line}2: ...") + nr_vars = int(header.group(2)) + 1 + + # Generator without comment lines + reader = (l for l in map(str.strip, f) if l and l[0] != '*') + + # CPMpy objects + vars = cp.boolvar(shape=nr_vars, name="x") + model = cp.Model() + + # Special case for first line -> might contain objective function + first_line = next(reader) + if OBJ_TERM_RE.match(first_line): + obj_expr = 
_parse_term(first_line, vars) + model.minimize(obj_expr) + else: # no objective found, parse as a constraint instead + model.add(_parse_constraint(first_line, vars)) + + # Start parsing line by line + for line in reader: + model.add(_parse_constraint(line, vars)) + + return model \ No newline at end of file From 548de8e13dd30137fd84031ccafc66bdb9f85bf1 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 09:35:49 +0200 Subject: [PATCH 004/152] Move parser out of init and add cli --- cpmpy/tools/opb/__init__.py | 168 +------------------------- cpmpy/tools/opb/parser.py | 221 +++++++++++++++++++++++++++++++++++ cpmpy/tools/wcnf/__init__.py | 82 +------------ cpmpy/tools/wcnf/parser.py | 133 +++++++++++++++++++++ 4 files changed, 364 insertions(+), 240 deletions(-) create mode 100644 cpmpy/tools/opb/parser.py create mode 100644 cpmpy/tools/wcnf/parser.py diff --git a/cpmpy/tools/opb/__init__.py b/cpmpy/tools/opb/__init__.py index 963c021cb..ae751c7e7 100644 --- a/cpmpy/tools/opb/__init__.py +++ b/cpmpy/tools/opb/__init__.py @@ -8,172 +8,14 @@ Currently only the restricted OPB PB24 format is supported (without WBO). - -================= -List of functions -================= +================== +List of submodules +================== .. autosummary:: :nosignatures: - read_opb + parser """ - -import os -import re -import lzma -import cpmpy as cp -from io import StringIO -from typing import Union -from functools import reduce -from operator import mul - -# Regular expressions -HEADER_RE = re.compile(r'(.*)\s*#variable=\s*(\d+)\s*#constraint=\s*(\d+).*') -TERM_RE = re.compile(r"([+-]?\d+)((?:\s+~?x\d+)+)") -OBJ_TERM_RE = re.compile(r'^min:') -IND_TERM_RE = re.compile(r'([>=|<=|=]+)\s+([+-]?\d+)') -IND_TERM_RE = re.compile(r'(>=|<=|=)\s*([+-]?\d+)') - - -def _parse_term(line, vars): - """ - Parse a line containing OPB terms into a CPMpy expression. 
- - Supports: - - Linear terms (e.g., +2 x1) - - Non-linear terms (e.g., -1 x1 x14) - - Negated variables using '~' (e.g., ~x5) - - Arguments: - line (str): A string containing one or more terms. - vars (list[cp.boolvar]): List or array of CPMpy Boolean variables. - - Returns: - cp.Expression: A CPMpy expression representing the sum of all parsed terms. - - Example: - >>> _parse_term("2 x2 x3 +3 x4 ~x5", vars) - sum([2, 3] * [(IV2*IV3), (IV4*~IV5)]) - """ - - terms = [] - for w, vars_str in TERM_RE.findall(line): - factors = [] - - for v in vars_str.split(): - if v.startswith("~x"): - idx = int(v[2:]) # remove "~x" - factors.append(~vars[idx]) - else: - idx = int(v[1:]) # remove "x" - factors.append(vars[idx]) - - term = int(w) * reduce(mul, factors, 1) # create weighted term - terms.append(term) - - return cp.sum(terms) - -def _parse_constraint(line, vars): - """ - Parse a single OPB constraint line into a CPMpy comparison expression. - - Arguments: - line (str): A string representing a single OPB constraint. - vars (list[cp.boolvar]): List or array of CPMpy Boolean variables. Will be index to get the variables for the constraint. - - Returns: - cp.expressions.core.Comparison: A CPMpy comparison expression representing - the constraint. - - Example: - >>> _parse_constraint("-1 x1 x14 -1 x1 ~x17 >= -1", vars) - sum([-1, -1] * [(IV1*IV14), (IV1*~IV17)]) >= -1 - """ - - op, ind_term = IND_TERM_RE.search(line).groups() - lhs = _parse_term(line, vars) - - rhs = int(ind_term) if ind_term.lstrip("+-").isdigit() else vars[int(ind_term)] - - return cp.expressions.core.Comparison( - name="==" if op == "=" else ">=", - left=lhs, - right=rhs - ) - -def read_opb(opb: Union[str, os.PathLike]) -> cp.Model: - """ - Parser for OPB (Pseudo-Boolean) format. Reads in an instance and returns its matching CPMpy model. 
- - Based on PyPBLib's example parser: https://hardlog.udl.cat/static/doc/pypblib/html/library/index.html#example-from-opb-to-cnf-file - - Supports: - - Linear and non-linear terms (e.g., -1 x1 x14 +2 x2) - - Negated variables using '~' (e.g., ~x5) - - Minimisation objective - - Comparison operators in constraints: '=', '>=' - - Arguments: - opb (str or os.PathLike): - - A file path to an OPB file (optionally LZMA-compressed with `.xz`) - - OR a string containing the OPB content directly - - Returns: - cp.Model: The CPMpy model of the OPB instance. - - Example: - >>> opb_text = ''' - ... * #variable= 5 #constraint= 2 #equal= 1 intsize= 64 #product= 5 sizeproduct= 13 - ... min: 2 x2 x3 +3 x4 ~x5 +2 ~x1 x2 +3 ~x1 x2 x3 ~x4 ~x5 ; - ... 2 x2 x3 -1 x1 ~x3 = 5 ; - ... ''' - >>> model = read_opb(opb_text) - >>> print(model) - Model(...) - - Notes: - - Comment lines starting with '*' are ignored. - - Only "min:" objectives are supported; "max:" is not recognized. - """ - - - # If opb is a path to a file -> open file - if isinstance(opb, (str, os.PathLike)) and os.path.exists(opb): - f_open = lzma.open if str(opb).endswith(".xz") else open - f = f_open(opb, 'rt') - # If opb is a string containing a model -> create a memory-mapped file - else: - f = StringIO(opb) - - # Look for header on first line - line = f.readline() - header = HEADER_RE.match(line) - if not header: # If not found on first line, look on second (happens when passing multi line string) - _line = f.readline() - header = HEADER_RE.match(_line) - if not header: - raise ValueError(f"Missing or incorrect header: \n0: {line}1: {_line}2: ...") - nr_vars = int(header.group(2)) + 1 - - # Generator without comment lines - reader = (l for l in map(str.strip, f) if l and l[0] != '*') - - # CPMpy objects - vars = cp.boolvar(shape=nr_vars, name="x") - model = cp.Model() - - # Special case for first line -> might contain objective function - first_line = next(reader) - if OBJ_TERM_RE.match(first_line): - obj_expr = 
_parse_term(first_line, vars) - model.minimize(obj_expr) - else: # no objective found, parse as a constraint instead - model.add(_parse_constraint(first_line, vars)) - - # Start parsing line by line - for line in reader: - model.add(_parse_constraint(line, vars)) - - return model \ No newline at end of file +from .parser import read_opb diff --git a/cpmpy/tools/opb/parser.py b/cpmpy/tools/opb/parser.py new file mode 100644 index 000000000..846c0874b --- /dev/null +++ b/cpmpy/tools/opb/parser.py @@ -0,0 +1,221 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## __init__.py +## +""" +OPB parser. + +Currently only the restricted OPB PB24 format is supported (without WBO). + + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read_opb +""" + + +import os +import re +import sys +import lzma +import argparse +import cpmpy as cp +from io import StringIO +from typing import Union +from functools import reduce +from operator import mul + +# Regular expressions +HEADER_RE = re.compile(r'(.*)\s*#variable=\s*(\d+)\s*#constraint=\s*(\d+).*') +TERM_RE = re.compile(r"([+-]?\d+)((?:\s+~?x\d+)+)") +OBJ_TERM_RE = re.compile(r'^min:') +IND_TERM_RE = re.compile(r'([>=|<=|=]+)\s+([+-]?\d+)') +IND_TERM_RE = re.compile(r'(>=|<=|=)\s*([+-]?\d+)') + + +def _parse_term(line, vars): + """ + Parse a line containing OPB terms into a CPMpy expression. + + Supports: + - Linear terms (e.g., +2 x1) + - Non-linear terms (e.g., -1 x1 x14) + - Negated variables using '~' (e.g., ~x5) + + Arguments: + line (str): A string containing one or more terms. + vars (list[cp.boolvar]): List or array of CPMpy Boolean variables. + + Returns: + cp.Expression: A CPMpy expression representing the sum of all parsed terms. 
+ + Example: + >>> _parse_term("2 x2 x3 +3 x4 ~x5", vars) + sum([2, 3] * [(IV2*IV3), (IV4*~IV5)]) + """ + + terms = [] + for w, vars_str in TERM_RE.findall(line): + factors = [] + + for v in vars_str.split(): + if v.startswith("~x"): + idx = int(v[2:]) # remove "~x" + factors.append(~vars[idx]) + else: + idx = int(v[1:]) # remove "x" + factors.append(vars[idx]) + + term = int(w) * reduce(mul, factors, 1) # create weighted term + terms.append(term) + + return cp.sum(terms) + +def _parse_constraint(line, vars): + """ + Parse a single OPB constraint line into a CPMpy comparison expression. + + Arguments: + line (str): A string representing a single OPB constraint. + vars (list[cp.boolvar]): List or array of CPMpy Boolean variables. Will be index to get the variables for the constraint. + + Returns: + cp.expressions.core.Comparison: A CPMpy comparison expression representing + the constraint. + + Example: + >>> _parse_constraint("-1 x1 x14 -1 x1 ~x17 >= -1", vars) + sum([-1, -1] * [(IV1*IV14), (IV1*~IV17)]) >= -1 + """ + + op, ind_term = IND_TERM_RE.search(line).groups() + lhs = _parse_term(line, vars) + + rhs = int(ind_term) if ind_term.lstrip("+-").isdigit() else vars[int(ind_term)] + + return cp.expressions.core.Comparison( + name="==" if op == "=" else ">=", + left=lhs, + right=rhs + ) + +def read_opb(opb: Union[str, os.PathLike]) -> cp.Model: + """ + Parser for OPB (Pseudo-Boolean) format. Reads in an instance and returns its matching CPMpy model. 
+ + Based on PyPBLib's example parser: https://hardlog.udl.cat/static/doc/pypblib/html/library/index.html#example-from-opb-to-cnf-file + + Supports: + - Linear and non-linear terms (e.g., -1 x1 x14 +2 x2) + - Negated variables using '~' (e.g., ~x5) + - Minimisation objective + - Comparison operators in constraints: '=', '>=' + + Arguments: + opb (str or os.PathLike): + - A file path to an OPB file (optionally LZMA-compressed with `.xz`) + - OR a string containing the OPB content directly + + Returns: + cp.Model: The CPMpy model of the OPB instance. + + Example: + >>> opb_text = ''' + ... * #variable= 5 #constraint= 2 #equal= 1 intsize= 64 #product= 5 sizeproduct= 13 + ... min: 2 x2 x3 +3 x4 ~x5 +2 ~x1 x2 +3 ~x1 x2 x3 ~x4 ~x5 ; + ... 2 x2 x3 -1 x1 ~x3 = 5 ; + ... ''' + >>> model = read_opb(opb_text) + >>> print(model) + Model(...) + + Notes: + - Comment lines starting with '*' are ignored. + - Only "min:" objectives are supported; "max:" is not recognized. + """ + + + # If opb is a path to a file -> open file + if isinstance(opb, (str, os.PathLike)) and os.path.exists(opb): + f_open = lzma.open if str(opb).endswith(".xz") else open + f = f_open(opb, 'rt') + # If opb is a string containing a model -> create a memory-mapped file + else: + f = StringIO(opb) + + # Look for header on first line + line = f.readline() + header = HEADER_RE.match(line) + if not header: # If not found on first line, look on second (happens when passing multi line string) + _line = f.readline() + header = HEADER_RE.match(_line) + if not header: + raise ValueError(f"Missing or incorrect header: \n0: {line}1: {_line}2: ...") + nr_vars = int(header.group(2)) + 1 + + # Generator without comment lines + reader = (l for l in map(str.strip, f) if l and l[0] != '*') + + # CPMpy objects + vars = cp.boolvar(shape=nr_vars, name="x") + model = cp.Model() + + # Special case for first line -> might contain objective function + first_line = next(reader) + if OBJ_TERM_RE.match(first_line): + obj_expr = 
_parse_term(first_line, vars) + model.minimize(obj_expr) + else: # no objective found, parse as a constraint instead + model.add(_parse_constraint(first_line, vars)) + + # Start parsing line by line + for line in reader: + model.add(_parse_constraint(line, vars)) + + return model + + +def main(): + parser = argparse.ArgumentParser(description="Parse and solve an OPB model using CPMpy") + parser.add_argument("model", help="Path to an OPB file (or raw OPB string if --string is given)") + parser.add_argument("-s", "--solver", default=None, help="Solver name to use (default: CPMpy's default)") + parser.add_argument("--string", action="store_true", help="Interpret the first argument (model) as a raw OPB string instead of a file path") + parser.add_argument("-t", "--time-limit", type=int, default=None, help="Time limit for the solver in seconds (default: no limit)") + args = parser.parse_args() + + # Build the CPMpy model + try: + if args.string: + model = read_opb(args.model) + else: + model = read_opb(os.path.expanduser(args.model)) + except Exception as e: + sys.stderr.write(f"Error reading model: {e}\n") + sys.exit(1) + + # Solve the model + try: + if args.solver: + result = model.solve(solver=args.solver, time_limit=args.time_limit) + else: + result = model.solve(time_limit=args.time_limit) + except Exception as e: + sys.stderr.write(f"Error solving model: {e}\n") + sys.exit(1) + + # Print results + print("Status:", model.status()) + if result is not None: + if model.has_objective(): + print("Objective:", model.objective_value()) + else: + print("No solution found.") + +if __name__ == "__main__": + main() diff --git a/cpmpy/tools/wcnf/__init__.py b/cpmpy/tools/wcnf/__init__.py index 5ce83a146..e2db10412 100644 --- a/cpmpy/tools/wcnf/__init__.py +++ b/cpmpy/tools/wcnf/__init__.py @@ -7,86 +7,14 @@ Set of utilities for working with WCNF-formatted CP models. 
-================= -List of functions -================= +================== +List of submodules +================== .. autosummary:: :nosignatures: - read_wcnf + parser """ - -import os -import lzma -import cpmpy as cp -from io import StringIO -from typing import Union - - -def _get_var(i, vars_dict): - """ - Returns CPMpy boolean decision variable matching to index `i` if exists, else creates a new decision variable. - - Arguments: - i: index - vars_dict (dict): dictionary to keep track of previously generated decision variables - """ - if i not in vars_dict: - vars_dict[i] = cp.boolvar(name=f"x{i}") # <- be carefull that name doesn't clash with generated variables during transformations / user variables - return vars_dict[i] - - -def read_wcnf(wcnf: Union[str, os.PathLike]) -> cp.Model: - """ - Parser for WCNF format. Reads in an instance and returns its matching CPMpy model. - - Arguments: - wcnf (str or os.PathLike): - - A file path to an WCNF file (optionally LZMA-compressed with `.xz`) - - OR a string containing the WCNF content directly - - Returns: - cp.Model: The CPMpy model of the WCNF instance. 
- """ - # If wcnf is a path to a file -> open file - if isinstance(wcnf, (str, os.PathLike)) and os.path.exists(wcnf): - f_open = lzma.open if str(wcnf).endswith(".xz") else open - f = f_open(wcnf, "rt") - # If wcnf is a string containing a model -> create a memory-mapped file - else: - f = StringIO(wcnf) - - model = cp.Model() - vars = {} - soft_terms = [] - - for raw in f: - line = raw.strip() - - # Empty line or a comment -> skip - if not line or line.startswith("c"): - continue - - # Hard clause - if line[0] == "h": - literals = map(int, line[1:].split()) - clause = [_get_var(i, vars) if i > 0 else ~_get_var(-i, vars) - for i in literals if i != 0] - model.add(cp.any(clause)) - - # Soft clause (weight first) - else: - parts = line.split() - weight = int(parts[0]) - literals = map(int, parts[1:]) - clause = [_get_var(i, vars) if i > 0 else ~_get_var(-i, vars) - for i in literals if i != 0] - soft_terms.append(weight * cp.any(clause)) - - # Objective = sum of soft clause terms - if soft_terms: - model.maximize(sum(soft_terms)) - - return model \ No newline at end of file +from .parser import read_wcnf diff --git a/cpmpy/tools/wcnf/parser.py b/cpmpy/tools/wcnf/parser.py new file mode 100644 index 000000000..72cec94c8 --- /dev/null +++ b/cpmpy/tools/wcnf/parser.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## __init__.py +## +""" +Parser for the WCNF format. + + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read_wcnf +""" + + +import os +import sys +import lzma +import argparse +import cpmpy as cp +from io import StringIO +from typing import Union + + +def _get_var(i, vars_dict): + """ + Returns CPMpy boolean decision variable matching to index `i` if exists, else creates a new decision variable. 
+ + Arguments: + i: index + vars_dict (dict): dictionary to keep track of previously generated decision variables + """ + if i not in vars_dict: + vars_dict[i] = cp.boolvar(name=f"x{i}") # <- be carefull that name doesn't clash with generated variables during transformations / user variables + return vars_dict[i] + + +def read_wcnf(wcnf: Union[str, os.PathLike]) -> cp.Model: + """ + Parser for WCNF format. Reads in an instance and returns its matching CPMpy model. + + Arguments: + wcnf (str or os.PathLike): + - A file path to an WCNF file (optionally LZMA-compressed with `.xz`) + - OR a string containing the WCNF content directly + + Returns: + cp.Model: The CPMpy model of the WCNF instance. + """ + # If wcnf is a path to a file -> open file + if isinstance(wcnf, (str, os.PathLike)) and os.path.exists(wcnf): + f_open = lzma.open if str(wcnf).endswith(".xz") else open + f = f_open(wcnf, "rt") + # If wcnf is a string containing a model -> create a memory-mapped file + else: + f = StringIO(wcnf) + + model = cp.Model() + vars = {} + soft_terms = [] + + for raw in f: + line = raw.strip() + + # Empty line or a comment -> skip + if not line or line.startswith("c"): + continue + + # Hard clause + if line[0] == "h": + literals = map(int, line[1:].split()) + clause = [_get_var(i, vars) if i > 0 else ~_get_var(-i, vars) + for i in literals if i != 0] + model.add(cp.any(clause)) + + # Soft clause (weight first) + else: + parts = line.split() + weight = int(parts[0]) + literals = map(int, parts[1:]) + clause = [_get_var(i, vars) if i > 0 else ~_get_var(-i, vars) + for i in literals if i != 0] + soft_terms.append(weight * cp.any(clause)) + + # Objective = sum of soft clause terms + if soft_terms: + model.maximize(sum(soft_terms)) + + return model + +def main(): + parser = argparse.ArgumentParser(description="Parse and solve a WCNF model using CPMpy") + parser.add_argument("model", help="Path to a WCNF file (or raw WCNF string if --string is given)") + parser.add_argument("-s", 
"--solver", default=None, help="Solver name to use (default: CPMpy's default)") + parser.add_argument("--string", action="store_true", help="Interpret the first argument (model) as a raw WCNF string instead of a file path") + parser.add_argument("-t", "--time-limit", type=int, default=None, help="Time limit for the solver in seconds (default: no limit)") + args = parser.parse_args() + + # Build the CPMpy model + try: + if args.string: + model = read_wcnf(args.model) + else: + model = read_wcnf(os.path.expanduser(args.model)) + except Exception as e: + sys.stderr.write(f"Error reading model: {e}\n") + sys.exit(1) + + # Solve the model + try: + if args.solver: + result = model.solve(solver=args.solver, time_limit=args.time_limit) + else: + result = model.solve(time_limit=args.time_limit) + except Exception as e: + sys.stderr.write(f"Error solving model: {e}\n") + sys.exit(1) + + # Print results + print("Status:", model.status()) + if result is not None: + if model.has_objective(): + print("Objective:", model.objective_value()) + else: + print("No solution found.") + +if __name__ == "__main__": + main() \ No newline at end of file From 450502570a36a8958610b36554dd2ced0f0814e7 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 11:53:31 +0200 Subject: [PATCH 005/152] Add MSE and OPB datasets --- cpmpy/tools/datasets/_base.py | 85 +++++++++++++++++++ cpmpy/tools/datasets/model/mse.py | 104 +++++++++++++++++++++++ cpmpy/tools/datasets/model/opb.py | 135 ++++++++++++++++++++++++++++++ 3 files changed, 324 insertions(+) create mode 100644 cpmpy/tools/datasets/_base.py create mode 100644 cpmpy/tools/datasets/model/mse.py create mode 100644 cpmpy/tools/datasets/model/opb.py diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py new file mode 100644 index 000000000..3c5338489 --- /dev/null +++ b/cpmpy/tools/datasets/_base.py @@ -0,0 +1,85 @@ +""" +Dataset Base Class + +This module defines the abstract `_Dataset` class, which serves as the 
foundation +for loading and managing benchmark instance collections in CPMpy-based experiments. +It standardizes how datasets are stored, accessed, and optionally transformed. +""" + +from abc import ABC, abstractmethod +import pathlib +from typing import Any, Tuple + +class _Dataset(ABC): + """ + Abstract base class for PyTorch-style datasets of benchmarking instances. + + The `_Dataset` class provides a standardized interface for downloading and + accessing benchmark instances. This class should not be used on its own. + """ + + def __init__( + self, + dataset_dir: str = ".", + transform=None, target_transform=None, + download: bool = False, + extension:str=".txt", + **kwargs + ): + self.dataset_dir = pathlib.Path(dataset_dir) + self.transform = transform + self.target_transform = target_transform + self.extension = extension + + if not self.dataset_dir.exists(): + if not download: + raise ValueError(f"Dataset not found. Please set download=True to download the dataset.") + else: + self.download() + + @abstractmethod + def category(self): + pass + + @abstractmethod + def download(self, *args, **kwargs): + pass + + def metadata(self, file): + metadata = self.category() | { + 'name': pathlib.Path(file).stem.replace(self.extension, ''), + 'path': file, + } + return metadata + + def __len__(self) -> int: + """Return the total number of instances.""" + return len(list(self.dataset_dir.glob(f"*{self.extension}"))) + + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + + if index < 0 or index >= len(self): + raise IndexError("Index out of range") + + # Get all compressed XML files and sort for deterministic behavior + files = sorted(list(self.dataset_dir.glob(f"*{self.extension}"))) + file_path = files[index] + + filename = str(file_path) + if self.transform: + # does not need to remain a filename... 
+ filename = self.transform(filename) + + # Basic metadata about the instance + metadata = self.metadata(file=filename, ) + if self.target_transform: + metadata = self.target_transform(metadata) + + return filename, metadata + + + + + + diff --git a/cpmpy/tools/datasets/model/mse.py b/cpmpy/tools/datasets/model/mse.py new file mode 100644 index 000000000..a749d75d0 --- /dev/null +++ b/cpmpy/tools/datasets/model/mse.py @@ -0,0 +1,104 @@ +""" +MaxSAT Evaluation (MSE) Dataset + +https://maxsat-evaluations.github.io/ +""" + +import zipfile +import pathlib +from urllib.request import urlretrieve +from urllib.error import HTTPError, URLError + +from .._base import _Dataset + + +class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible + """ + MaxSAT Evaluation (MSE) benchmark dataset. + + Provides access to benchmark instances from the MaxSAT Evaluation + competitions. Instances are grouped by `year` and `track` (e.g., + `"exact-unweighted"`, `"exact-weighted"`) and stored as `.wcnf.xz` files. + If the dataset is not available locally, it can be automatically + downloaded and extracted. + + More information on the competition can be found here: https://maxsat-evaluations.github.io/ + """ + + def __init__( + self, + root: str = ".", + year: int = 2024, track: str = "exact-unweighted", + transform=None, target_transform=None, + download: bool = False + ): + """ + Constructor for a dataset object of the MSE competition. + + Arguments: + root (str): Root directory where datasets are stored or will be downloaded to (default="."). + year (int): Competition year of the dataset to use (default=2024). + track (str): Track name specifying which subset of the competition instances to load (default="exact-unweighted"). + transform (callable, optional): Optional transform applied to the instance file path. + target_transform (callable, optional): Optional transform applied to the metadata dictionary. 
+ download (bool): If True, downloads the dataset if it does not exist locally (default=False). + + + Raises: + ValueError: If the dataset directory does not exist and `download=False`, + or if the requested year/track combination is not available. + """ + + self.root = pathlib.Path(root) + self.year = year + self.track = track + + dataset_dir = self.root / str(year) / track + + super().__init__( + dataset_dir=dataset_dir, + transform=transform, target_transform=target_transform, + download=download, extension=".wcnf.xz" + ) + + + def category(self): + return { + "year": self.year, + "track": self.track + } + + + def download(self): + print(f"Downloading MaxSAT Eval {self.year} {self.track} instances...") + + zip_name = f"mse{str(self.year)[2:]}-{self.track}.zip" + url = f"https://www.cs.helsinki.fi/group/coreo/MSE{self.year}-instances/" + + url_path = url + zip_name + zip_path = self.root / zip_name + + try: + urlretrieve(url_path, str(zip_path)) + except (HTTPError, URLError) as e: + raise ValueError(f"No dataset available for year {self.year} and track {self.track}. 
Error: {str(e)}") + + # Extract only the specific track folder from the tar + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + # Create track folder in root directory, parents=True ensures recursive creation + self.dataset_dir.mkdir(parents=True, exist_ok=True) + + # Extract files + for file_info in zip_ref.infolist(): + # Extract file to family_dir, removing main_folder/track prefix + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: + target.write(source.read()) + # Clean up the zip file + zip_path.unlink() + + +if __name__ == "__main__": + dataset = MSEDataset(year=2024, track="exact-weighted", download=True) + print("Dataset size:", len(dataset)) + print("Instance 0:", dataset[0]) diff --git a/cpmpy/tools/datasets/model/opb.py b/cpmpy/tools/datasets/model/opb.py new file mode 100644 index 000000000..d3602954c --- /dev/null +++ b/cpmpy/tools/datasets/model/opb.py @@ -0,0 +1,135 @@ +""" +Pseudo Boolean Competition (PB) Dataset + +https://www.cril.univ-artois.fr/PB25/ +""" + +import os +import pathlib +from urllib.request import urlretrieve +from urllib.error import HTTPError, URLError +import tarfile + +from .._base import _Dataset + + +class OPBDataset(_Dataset): + """ + Pseudo Boolean Competition (PB) benchmark dataset. + + Provides access to benchmark instances from the Pseudo Boolean + competitions. Instances are grouped by `year` and `track` (e.g., + `"OPT-LIN"`, `"DEC-LIN"`) and stored as `.opb.xz` files. + If the dataset is not available locally, it can be automatically + downloaded and extracted. + + More information on the competition can be found here: https://www.cril.univ-artois.fr/PB25/ + """ + + def __init__( + self, + root: str = ".", + year: int = 2024, track: str = "OPT-LIN", + transform=None, target_transform=None, + download: bool = False + ): + """ + Constructor for a dataset object of the PB competition. 
+ + Arguments: + root (str): Root directory where datasets are stored or will be downloaded to (default="."). + year (int): Competition year of the dataset to use (default=2024). + track (str): Track name specifying which subset of the competition instances to load (default="OPT-LIN"). + transform (callable, optional): Optional transform applied to the instance file path. + target_transform (callable, optional): Optional transform applied to the metadata dictionary. + download (bool): If True, downloads the dataset if it does not exist locally (default=False). + + + Raises: + ValueError: If the dataset directory does not exist and `download=False`, + or if the requested year/track combination is not available. + """ + + self.root = pathlib.Path(root) + self.year = year + self.track = track + + dataset_dir = self.root / str(year) / track + + super().__init__( + dataset_dir=dataset_dir, + transform=transform, target_transform=target_transform, + download=download, extension=".opb.xz" + ) + + def category(self): + return { + "year": self.year, + "track": self.track + } + + def metadata(self, file): + return super().metadata(file) | {'author': str(file).split(os.sep)[-1].split("_")[0],} + + + def download(self): + # TODO: add option to filter on competition instances + print(f"Downloading OPB {self.year} {self.track} instances...") + url = f"https://www.cril.univ-artois.fr/PB24/benchs/" + year_suffix = str(self.year)[2:] # Drop the starting '20' + url_path = url + f"normalized-PB{year_suffix}.tar" + tar_path = self.root / f"normalized-extraPB{year_suffix}.tar" + + try: + urlretrieve(url_path, str(tar_path)) + except (HTTPError, URLError) as e: + raise ValueError(f"No dataset available for year {self.year}. Error: {str(e)}") + + # Extract only the specific track folder from the tar + with tarfile.open(tar_path, "r:*") as tar_ref: # r:* handles .tar, .tar.gz, .tar.bz2, etc. 
+ # Get the main folder name + main_folder = None + for name in tar_ref.getnames(): + if "/" in name: + main_folder = name.split("/")[0] + break + + if main_folder is None: + raise ValueError(f"Could not find main folder in tar file") + + # Extract only files from the specified track + # Get all unique track names from tar + tracks = set() + for member in tar_ref.getmembers(): + parts = member.name.split("/") + if len(parts) > 2 and parts[0] == main_folder: + tracks.add(parts[1]) + + # Check if requested track exists + if self.track not in tracks: + raise ValueError(f"Track '{self.track}' not found in dataset. Available tracks: {sorted(tracks)}") + + # Create track folder in root directory + self.dataset_dir.mkdir(parents=True, exist_ok=True) + + # Extract files for the specified track + prefix = f"{main_folder}/{self.track}/" + for member in tar_ref.getmembers(): + if member.name.startswith(prefix) and member.isfile(): + # Path relative to main_folder/track + relative_path = member.name[len(prefix):] + + # Flatten: replace "/" with "_" to encode subfolders (some instances have clashing names) + flat_name = relative_path.replace("/", "_") + target_path = self.dataset_dir / flat_name + + with tar_ref.extractfile(member) as source, open(target_path, "wb") as target: + target.write(source.read()) + + # Clean up the tar file + tar_path.unlink() + +if __name__ == "__main__": + dataset = OPBDataset(year=2024, track="DEC-LIN", download=True) + print("Dataset size:", len(dataset)) + print("Instance 0:", dataset[0]) From 2b26034cc3e8ecadde39b1b16ba8884b59155673 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 11:55:12 +0200 Subject: [PATCH 006/152] Rename datasets to dataset --- cpmpy/tools/{datasets => dataset}/_base.py | 0 cpmpy/tools/{datasets => dataset}/model/mse.py | 0 cpmpy/tools/{datasets => dataset}/model/opb.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename cpmpy/tools/{datasets => dataset}/_base.py (100%) rename 
cpmpy/tools/{datasets => dataset}/model/mse.py (100%) rename cpmpy/tools/{datasets => dataset}/model/opb.py (100%) diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/dataset/_base.py similarity index 100% rename from cpmpy/tools/datasets/_base.py rename to cpmpy/tools/dataset/_base.py diff --git a/cpmpy/tools/datasets/model/mse.py b/cpmpy/tools/dataset/model/mse.py similarity index 100% rename from cpmpy/tools/datasets/model/mse.py rename to cpmpy/tools/dataset/model/mse.py diff --git a/cpmpy/tools/datasets/model/opb.py b/cpmpy/tools/dataset/model/opb.py similarity index 100% rename from cpmpy/tools/datasets/model/opb.py rename to cpmpy/tools/dataset/model/opb.py From e238c2934c3f5127f3c0e6ab8766a25814941fce Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 13:45:48 +0200 Subject: [PATCH 007/152] Dataset specific 'open' --- cpmpy/tools/dataset/_base.py | 4 ++++ cpmpy/tools/dataset/model/mse.py | 5 ++++- cpmpy/tools/dataset/model/opb.py | 4 ++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 3c5338489..ce2206110 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -45,6 +45,10 @@ def category(self): def download(self, *args, **kwargs): pass + @abstractmethod + def open(self, instance): + pass + def metadata(self, file): metadata = self.category() | { 'name': pathlib.Path(file).stem.replace(self.extension, ''), diff --git a/cpmpy/tools/dataset/model/mse.py b/cpmpy/tools/dataset/model/mse.py index a749d75d0..84e8c5dfa 100644 --- a/cpmpy/tools/dataset/model/mse.py +++ b/cpmpy/tools/dataset/model/mse.py @@ -4,6 +4,8 @@ https://maxsat-evaluations.github.io/ """ +import lzma +import os import zipfile import pathlib from urllib.request import urlretrieve @@ -11,7 +13,6 @@ from .._base import _Dataset - class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible """ MaxSAT Evaluation (MSE) benchmark dataset. 
@@ -97,6 +98,8 @@ def download(self): # Clean up the zip file zip_path.unlink() + def open(self, instance: os.PathLike) -> callable: + return lzma.open if str(instance).endswith(".xz") else open if __name__ == "__main__": dataset = MSEDataset(year=2024, track="exact-weighted", download=True) diff --git a/cpmpy/tools/dataset/model/opb.py b/cpmpy/tools/dataset/model/opb.py index d3602954c..bc051d784 100644 --- a/cpmpy/tools/dataset/model/opb.py +++ b/cpmpy/tools/dataset/model/opb.py @@ -4,6 +4,7 @@ https://www.cril.univ-artois.fr/PB25/ """ +import lzma import os import pathlib from urllib.request import urlretrieve @@ -129,6 +130,9 @@ def download(self): # Clean up the tar file tar_path.unlink() + def open(self, instance: os.PathLike) -> callable: + return lzma.open if str(instance).endswith(".xz") else open + if __name__ == "__main__": dataset = OPBDataset(year=2024, track="DEC-LIN", download=True) print("Dataset size:", len(dataset)) From 669875acbcfc5fa43a1ab139c92fcd9b4c5badf1 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 13:46:22 +0200 Subject: [PATCH 008/152] Dataset module init file --- cpmpy/tools/dataset/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 cpmpy/tools/dataset/__init__.py diff --git a/cpmpy/tools/dataset/__init__.py b/cpmpy/tools/dataset/__init__.py new file mode 100644 index 000000000..e69de29bb From c1bd2fef45bbb4a39413794fbfcfbd551fe54db2 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 13:48:36 +0200 Subject: [PATCH 009/152] Add benchmark runners --- cpmpy/tools/benchmark/__init__.py | 0 cpmpy/tools/benchmark/_base.py | 496 ++++++++++++++++++++++++++++++ cpmpy/tools/benchmark/mse.py | 205 ++++++++++++ cpmpy/tools/benchmark/opb.py | 197 ++++++++++++ cpmpy/tools/benchmark/runner.py | 287 +++++++++++++++++ 5 files changed, 1185 insertions(+) create mode 100644 cpmpy/tools/benchmark/__init__.py create mode 100644 cpmpy/tools/benchmark/_base.py create mode 100644 
cpmpy/tools/benchmark/mse.py create mode 100644 cpmpy/tools/benchmark/opb.py create mode 100644 cpmpy/tools/benchmark/runner.py diff --git a/cpmpy/tools/benchmark/__init__.py b/cpmpy/tools/benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py new file mode 100644 index 000000000..85119a822 --- /dev/null +++ b/cpmpy/tools/benchmark/_base.py @@ -0,0 +1,496 @@ +""" +Benchmark framework for CPMpy models. + +This module provides the `Benchmark` base class, designed to run constraint programming +benchmarks in a structured fashion. It allows reading instances, posting them to different +back-end solvers, and handling solver execution with limits on time and memory. +It also provides hooks for customizing logging, intermediate solution printing, and +error handling. Although this base class can be used on its own (example below), +users will most likely want to have a look at one of its subclasses for running a specific +benchmark dataset, e.g. xcsp3, opb, mse, ... + + +Usage Example +------------- +>>> from myparser import read_instance # your custom model parser (or one included in CPMpy) +>>> bm = Benchmark(reader=read_instance) +>>> bm.run( +... instance="example.extension", # your benchmark instance (e.g. coming from a CPMpy model dataset) +... solver="ortools", +... time_limit=30, +... mem_limit=1024, +... verbose=True +... ) +Status: OPTIMAL +Objective: 42 +Solution: ... + +""" + + +from abc import ABC + +import time +import random +import psutil +import warnings +from typing import Optional + +import cpmpy as cp +from cpmpy.tools.benchmark import _mib_as_bytes, _wall_time, set_memory_limit, set_time_limit, _bytes_as_mb, _bytes_as_gb + + +class Benchmark(ABC): + """ + Abstract base class for running CPMpy benchmarks. + + The `Benchmark` class provides a standardized framework for reading instances, + posting models to solvers, and managing solver runs with resource limits. 
+ It is designed to be extended or customized for specific benchmarking needs. + """ + + def __init__(self, reader:callable): + """ + Arguments: + reader (callable): A parser from a model format to a CPMPy model. + """ + self.reader = reader + + def read_instance(self, instance) -> cp.Model: + """ + Parse a model instance to a CPMpy model. + + Arguments: + instance (str or os.PathLike): The model instance to parse into a CPMpy model. + """ + return self.reader(instance) + + """ + Callback methods which can be overwritten to make a custom benchmark run. + """ + + def print_comment(self, comment:str): + print(comment) + + def print_intermediate(self, objective:int): + print("Intermediate solution:", objective) + + def print_result(self, s): + self.print_comment(s.status()) + + def handle_memory_error(self, mem_limit): + self.print_comment(f"MemoryError raised. Reached limit of {mem_limit} MiB") + + def handle_not_implemented(self, e): + self.print_comment(str(e)) + + def handle_exception(self, e): + self.print_comment(f"An {type(e)} got raised: {e}") + import traceback + self.print_comment("Stack trace:") + for line in traceback.format_exc().split('\n'): + if line.strip(): + self.print_comment(line) + + """ + Solver arguments (can also be tweaked for a specific benchmark). 
+ """ + + def ortools_arguments( + self, + model: cp.Model, + cores: Optional[int] = None, + seed: Optional[int] = None, + intermediate: bool = False, + **kwargs + ): + # https://github.com/google/or-tools/blob/stable/ortools/sat/sat_parameters.proto + res = dict() + + # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 + res |= { + "interleave_search": True, + "use_rins_lns": False, + } + if not model.has_objective(): + res |= { "num_violation_ls": 1 } + + if cores is not None: + res |= { "num_search_workers": cores } + if seed is not None: + res |= { "random_seed": seed } + + if intermediate and model.has_objective(): + # Define custom ORT solution callback, then register it + _self = self + from ortools.sat.python import cp_model as ort + class OrtSolutionCallback(ort.CpSolverSolutionCallback): + """ + For intermediate objective printing. + """ + + def __init__(self): + super().__init__() + self.__start_time = time.time() + self.__solution_count = 1 + + def on_solution_callback(self): + """Called on each new solution.""" + + current_time = time.time() + obj = int(self.ObjectiveValue()) + _self.print_comment('Solution %i, time = %0.2fs' % + (self.__solution_count, current_time - self.__start_time)) + _self.print_intermediate(obj) + self.__solution_count += 1 + + + def solution_count(self): + """Returns the number of solutions found.""" + return self.__solution_count + + # Register the callback + res |= { "solution_callback": OrtSolutionCallback() } + + def internal_options(solver: "CPM_ortools"): + # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 + solver.ort_solver.parameters.subsolvers.extend(["default_lp", "max_lp", "quick_restart"]) + if not model.has_objective(): + solver.ort_solver.parameters.subsolvers.append("core_or_no_lp") + if len(solver.ort_model.proto.search_strategy) != 0: + 
solver.ort_solver.parameters.subsolvers.append("fixed") + + return res, internal_options + + def exact_arguments( + self, + seed: Optional[int] = None, + **kwargs + ): + # Documentation: https://gitlab.com/JoD/exact/-/blob/main/src/Options.hpp?ref_type=heads + res = dict() + if seed is not None: + res |= { "seed": seed } + + return res, None + + def choco_arguments(): + # Documentation: https://github.com/chocoteam/pychoco/blob/master/pychoco/solver.py + return {}, None + + def z3_arguments( + self, + model: cp.Model, + cores: int = 1, + seed: Optional[int] = None, + mem_limit: Optional[int] = None, + **kwargs + ): + # Documentation: https://microsoft.github.io/z3guide/programming/Parameters/ + # -> is outdated, just let it crash and z3 will report the available options + + res = dict() + + if model.has_objective(): + # Opt does not seem to support setting random seed or max memory + pass + else: + # Sat parameters + if cores is not None: + res |= { "threads": cores } # TODO what with hyperthreadding, when more threads than cores + if seed is not None: + res |= { "random_seed": seed } + if mem_limit is not None: + res |= { "max_memory": _bytes_as_mb(mem_limit) } + + return res, None + + def minizinc_arguments( + self, + solver: str, + cores: Optional[int] = None, + seed: Optional[int] = None, + **kwargs + ): + # Documentation: https://minizinc-python.readthedocs.io/en/latest/api.html#minizinc.instance.Instance.solve + res = dict() + if cores is not None: + res |= { "processes": cores } + if seed is not None: + res |= { "random_seed": seed } + + #if solver.endswith("gecode"): + # Documentation: https://www.minizinc.org/doc-2.4.3/en/lib-gecode.html + #elif solver.endswith("chuffed"): + # Documentation: + # - https://www.minizinc.org/doc-2.5.5/en/lib-chuffed.html + # - https://github.com/chuffed/chuffed/blob/develop/chuffed/core/options.h + + return res, None + + def gurobi_arguments( + self, + model: cp.Model, + cores: Optional[int] = None, + seed: Optional[int] = 
None, + mem_limit: Optional[int] = None, + intermediate: bool = False, + **kwargs + ): + # Documentation: https://www.gurobi.com/documentation/9.5/refman/parameters.html#sec:Parameters + res = dict() + if cores is not None: + res |= { "Threads": cores } + if seed is not None: + res |= { "Seed": seed } + if mem_limit is not None: + res |= { "MemLimit": _bytes_as_gb(mem_limit) } + + if intermediate and model.has_objective(): + + _self = self + + class GurobiSolutionCallback: + def __init__(self, model:cp.Model): + self.__start_time = time.time() + self.__solution_count = 0 + self.model = model + + def callback(self, *args, **kwargs): + current_time = time.time() + model, state = args + + # Callback codes: https://www.gurobi.com/documentation/current/refman/cb_codes.html#sec:CallbackCodes + + from gurobipy import GRB + # if state == GRB.Callback.MESSAGE: # verbose logging + # print_comment("log message: " + str(model.cbGet(GRB.Callback.MSG_STRING))) + if state == GRB.Callback.MIP: # callback from the MIP solver + if model.cbGet(GRB.Callback.MIP_SOLCNT) > self.__solution_count: # do we have a new solution? 
+ + obj = int(model.cbGet(GRB.Callback.MIP_OBJBST)) + _self.print_comment('Solution %i, time = %0.2fs' % + (self.__solution_count, current_time - self.__start_time)) + _self.print_intermediate(obj) + self.__solution_count = model.cbGet(GRB.Callback.MIP_SOLCNT) + + res |= { "solution_callback": GurobiSolutionCallback(model).callback } + + return res, None + + def cpo_arguments( + self, + model: cp.Model, + cores: Optional[int] = None, + seed: Optional[int] = None, + intermediate: bool = False, + **kwargs + ): + # Documentation: https://ibmdecisionoptimization.github.io/docplex-doc/cp/docplex.cp.parameters.py.html#docplex.cp.parameters.CpoParameters + res = dict() + if cores is not None: + res |= { "Workers": cores } + if seed is not None: + res |= { "RandomSeed": seed } + + if intermediate and model.has_objective(): + from docplex.cp.solver.solver_listener import CpoSolverListener + _self = self + class CpoSolutionCallback(CpoSolverListener): + + def __init__(self): + super().__init__() + self.__start_time = time.time() + self.__solution_count = 1 + + def result_found(self, solver, sres): + current_time = time.time() + obj = sres.get_objective_value() + if obj is not None: + _self.print_comment('Solution %i, time = %0.2fs' % + (self.__solution_count, current_time - self.__start_time)) + _self.print_intermediate(obj) + self.__solution_count += 1 + + def solution_count(self): + """Returns the number of solutions found.""" + return self.__solution_count + + # Register the callback + res |= { "solution_callback": CpoSolutionCallback } + + return res, None + + + """ + Methods which can, bit most likely shouldn't, be overwritten. 
+ """ + + def set_memory_limit(self, mem_limit, verbose=False): + set_memory_limit(mem_limit, verbose=verbose) + + def set_time_limit(self, time_limit, verbose=False): + p = psutil.Process() + if time_limit is not None: + set_time_limit(int(time_limit - _wall_time(p) + time.process_time()), verbose=verbose) + else: + set_time_limit(None) + + def post_model(self, model, solver, solver_args): + """ + Post the model to the selected backend solver. + """ + if solver == "exact": # Exact2 takes its options at creation time + s = cp.SolverLookup.get(solver, model, **solver_args) + solver_args = dict() # no more solver args needed + else: + s = cp.SolverLookup.get(solver, model) + return s + + + """ + Internal workings + """ + + def solver_arguments( + self, + solver: str, + model: cp.Model, + seed: Optional[int] = None, + intermediate: bool = False, + cores: int = 1, + mem_limit: Optional[int] = None, + **kwargs + ): + opt = model.has_objective() + sat = not opt + + if solver == "ortools": + return self.ortools_arguments(model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) + elif solver == "exact": + return self.exact_arguments(seed=seed, **kwargs) + elif solver == "choco": + return self.choco_arguments() + elif solver == "z3": + return self.z3_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, **kwargs) + elif solver.startswith("minizinc"): # also can have a subsolver + return self.minizinc_arguments(solver, cores=cores, seed=seed, **kwargs) + elif solver == "gurobi": + return self.gurobi_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, intermediate=intermediate, opt=opt, **kwargs) + elif solver == "cpo": + return self.cpo_arguments(model=model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) + else: + self.print_comment(f"setting parameters of {solver} is not (yet) supported") + return dict() + + def run( + self, + instance:str, # path to the instance to run + open:Optional[callable] = None, # how to 'open' the 
instance file + seed: Optional[int] = None, # random seed + time_limit: Optional[int] = None, # time limit for this single instance + mem_limit: Optional[int] = None, # MiB: 1024 * 1024 bytes + cores: int = 1, + solver: str = None, # which backend solver to use + time_buffer: int = 0, + intermediate: bool = False, + verbose: bool = False, + **kwargs, + ): + + if not verbose: + warnings.filterwarnings("ignore") + + try: + + # --------------------------- Global Configuration --------------------------- # + + # Get the current process + p = psutil.Process() + + # pychoco currently does not support setting the mem_limit + if solver == "choco" and mem_limit is not None: + warnings.warn("'mem_limit' is currently not supported with choco, issues with GraalVM") + mem_limit = None + + # Set random seed (if provided) + if seed is not None: + random.seed(seed) + + # Set memory limit (if provided) + if mem_limit is not None: + self.set_memory_limit(mem_limit, verbose=verbose) + + # Set time limit (if provided) + if time_limit is not None: + self.set_time_limit(time_limit, verbose=verbose) # set remaining process time != wall time + + # ------------------------------ Parse instance ------------------------------ # + + time_parse = time.time() + model = self.read_instance(instance, open=open) + time_parse = time.time() - time_parse + if verbose: self.print_comment(f"took {time_parse:.4f} seconds to parse model [{instance}]") + + if time_limit and time_limit < _wall_time(p): + raise TimeoutError("Time's up after parse") + + # ------------------------ Post CPMpy model to solver ------------------------ # + + solver_args, internal_options = self.solver_arguments(solver, model=model, seed=seed, + intermediate=intermediate, + cores=cores, mem_limit=_mib_as_bytes(mem_limit) if mem_limit is not None else None, + **kwargs) + + # Post model to solver + time_post = time.time() + s = self.post_model(model, solver, solver_args) + time_post = time.time() - time_post + if verbose: 
self.print_comment(f"took {time_post:.4f} seconds to post model to {solver}") + + if time_limit and time_limit < _wall_time(p): + raise TimeoutError("Time's up after post") + + # ------------------------------- Solve model ------------------------------- # + + if time_limit: + # give solver only the remaining time + time_limit = time_limit - _wall_time(p) - time_buffer + # disable signal-based time limit and let the solver handle it (solvers don't play well with difference between cpu and wall time) + self.set_time_limit(None) + + if verbose: self.print_comment(f"{time_limit}s left to solve") + + time_solve = time.time() + try: + if internal_options is not None: + internal_options(s) # Set more internal solver options (need access to native solver object) + is_sat = s.solve(time_limit=time_limit, **solver_args) + except RuntimeError as e: + if "Program interrupted by user." in str(e): # Special handling for Exact + raise TimeoutError("Exact interrupted due to timeout") + else: + raise e + + time_solve = time.time() - time_solve + if verbose: self.print_comment(f"took {time_solve:.4f} seconds to solve") + + # ------------------------------- Print result ------------------------------- # + + self.print_result(s) + + # ------------------------------------- - ------------------------------------ # + + + except MemoryError as e: + self.handle_memory_error(mem_limit) + raise e + except NotImplementedError as e: + self.handle_not_implemented(e) + raise e + except Exception as e: + self.handle_exception(e) + raise e + + + \ No newline at end of file diff --git a/cpmpy/tools/benchmark/mse.py b/cpmpy/tools/benchmark/mse.py new file mode 100644 index 000000000..08084b645 --- /dev/null +++ b/cpmpy/tools/benchmark/mse.py @@ -0,0 +1,205 @@ +""" +MSE competition as a CPMpy benchmark + +This module provides a benchmarking framework for running CPMpy on MaxSAT Evaluation (MSE) +competition instances encoded in WCNF (Weighted CNF) format. 
It extends the generic
+`Benchmark` base class with MSE-specific logging and result reporting in DIMACS-like format.
+
+Command-line Interface
+----------------------
+This script can be run directly to benchmark solvers on MSE datasets.
+
+Usage:
+    python mse.py --year 2024 --track exact-weighted --solver ortools
+
+Arguments:
+    --year           Competition year (e.g., 2024).
+    --track          Track type (e.g., exact-weighted, exact-unweighted).
+    --solver         Solver name (e.g., ortools, exact, choco, ...).
+    --workers        Number of parallel workers to use.
+    --time-limit     Time limit in seconds per instance.
+    --mem-limit      Memory limit in MB per instance.
+    --cores          Number of cores to assign to a single instance.
+    --output-dir     Output directory for CSV files.
+    --verbose        Show solver output if set.
+    --intermediate   Report intermediate solutions if supported.
+
+===============
+List of classes
+===============
+
+.. autosummary::
+    :nosignatures:
+
+    MSEBenchmark
+
+=================
+List of functions
+=================
+
+.. autosummary::
+    :nosignatures:
+
+    solution_mse
+"""
+
+import argparse
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+import warnings
+
+from cpmpy.tools.benchmark.runner import benchmark_runner
+from cpmpy.tools.benchmark._base import Benchmark
+
+from cpmpy.tools.wcnf import read_wcnf
+from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus
+
+
+class ExitStatus(Enum):
+    unsupported:str = "UNSUPPORTED" # instance contains an unsupported feature (e.g. an unsupported global constraint)
+    sat:str = "SATISFIABLE" # CSP : found a solution | COP : found a solution but couldn't prove optimality
+    optimal:str = "OPTIMUM" + chr(32) + "FOUND" # optimal COP solution found
+    unsat:str = "UNSATISFIABLE" # instance is unsatisfiable
+    unknown:str = "UNKNOWN" # any other case
+
+def solution_mse(model):
+    """
+    Convert a CPMpy model solution into the MSE solution string format. 
+ + Arguments: + model (cp.solvers.SolverInterface): The solver-specific model for which to print its solution in MSE format. + + Returns: + str: MSE-formatted solution string. + """ + variables = [var for var in model.user_vars if var.name[:2] == "BV"] # dirty workaround for all missed aux vars in user vars + variables = sorted(variables, key=lambda v: int("".join(filter(str.isdigit, v.name)))) + return " ".join([str(1 if var.value() else 0) for var in variables]) + +class MSEBenchmark(Benchmark): + + """ + Benchmark runner for MSE (MaxSAT Evaluation) competition instances. + + This class extends `Benchmark` to implement MSE-specific solution printing + in DIMACS-like output format (`c`, `s`, `v`, `o` lines). It uses CPMpy's `read_wcnf` + to parse WCNF (Weighted CNF) instances and runs them on a selected solver supported + by CPMpy. + """ + + def __init__(self): + super().__init__(reader=read_wcnf) + + def print_comment(self, comment:str): + print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) + + def print_status(self, status: ExitStatus) -> None: + print('s' + chr(32) + status.value, end="\n", flush=True) + + def print_value(self, value: str) -> None: + value = value[:-2].replace("\n", "\nv" + chr(32)) + value[-2:] + print('v' + chr(32) + value, end="\n", flush=True) + + def print_objective(self, objective: int) -> None: + print('o' + chr(32) + str(objective), end="\n", flush=True) + + def print_result(self, s): + if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_value(solution_mse(s)) + self.print_status(ExitStatus.optimal) + elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_value(solution_mse(s)) + self.print_status(ExitStatus.sat) + elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: + self.print_status(ExitStatus.unsat) + else: + self.print_comment("Solver did not find any solution within the time/memory limit") + self.print_status(ExitStatus.unknown) + + def handle_memory_error(self, mem_limit): + 
super().handle_memory_error(mem_limit) + self.print_status(ExitStatus.unknown) + + def handle_not_implemented(self, e): + super().handle_not_implemented(e) + self.print_status(ExitStatus.unsupported) + + def handle_exception(self, e): + super().handle_exception(e) + self.print_status(ExitStatus.unknown) + + def parse_output_line(self, line, result): + if line.startswith('s '): + result['status'] = line[2:].strip() + elif line.startswith('v '): + # only record first line, contains 'type' and 'cost' + solution = line.split("\n")[0][2:].strip() + if solution not in result: + result['solution'] = solution + else: + result['solution'] = result['solution'] + ' ' + str(solution) + elif line.startswith('o '): + obj = int(line[2:].strip()) + if result['intermediate'] is None: + result['intermediate'] = [] + result['intermediate'] += [(sol_time, obj)] + result['objective_value'] = obj + obj = None + elif line.startswith('c Solution'): + parts = line.split(', time = ') + # Get solution time from comment for intermediate solution -> used for annotating 'o ...' 
lines + sol_time = float(parts[-1].replace('s', '').rstrip()) + elif line.startswith('c took '): + # Parse timing information + parts = line.split(' seconds to ') + if len(parts) == 2: + time_val = float(parts[0].replace('c took ', '')) + action = parts[1].strip() + if action.startswith('parse'): + result['time_parse'] = time_val + elif action.startswith('convert'): + result['time_model'] = time_val + elif action.startswith('post'): + result['time_post'] = time_val + elif action.startswith('solve'): + result['time_solve'] = time_val + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Benchmark solvers on MSE instances') + parser.add_argument('--year', type=int, required=True, help='Competition year (e.g., 2024)') + parser.add_argument('--track', type=str, required=True, help='Track type (e.g., exact-weighted, exact-unweighted)') + parser.add_argument('--solver', type=str, required=True, help='Solver name (e.g., ortools, exact, choco, ...)') + parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers') + parser.add_argument('--time-limit', type=int, default=300, help='Time limit in seconds per instance') + parser.add_argument('--mem-limit', type=int, default=8192, help='Memory limit in MB per instance') + parser.add_argument('--cores', type=int, default=1, help='Number of cores to assign tp a single instance') + parser.add_argument('--output-dir', type=str, default='results', help='Output directory for CSV files') + parser.add_argument('--verbose', action='store_true', help='Show solver output') + parser.add_argument('--intermediate', action='store_true', help='Report on intermediate solutions') + # parser.add_argument('--checker-path', type=str, default=None, + # help='Path to the XCSP3 solution checker JAR file') + args = parser.parse_args() + + if not args.verbose: + warnings.filterwarnings("ignore") + + # Load benchmark instances (as a dataset) + from cpmpy.tools.dataset.model.mse import MSEDataset + 
dataset = MSEDataset(year=args.year, track=args.track, download=True)
+
+    # Create output directory
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Get current timestamp in a filename-safe format
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    # Define output file path with timestamp
+    output_file = str(output_dir / "mse" / f"mse_{args.year}_{args.track}_{args.solver}_{timestamp}.csv")
+
+    # Run the benchmark
+    instance_runner = MSEBenchmark()
+    output_file = benchmark_runner(dataset=dataset, instance_runner=instance_runner, output_file=output_file, **vars(args))
+    print(f"Results added to {output_file}")
diff --git a/cpmpy/tools/benchmark/opb.py b/cpmpy/tools/benchmark/opb.py
new file mode 100644
index 000000000..3fc5202cd
--- /dev/null
+++ b/cpmpy/tools/benchmark/opb.py
@@ -0,0 +1,197 @@
+"""
+PB competition as a CPMpy benchmark
+
+This module provides a benchmarking framework for running CPMpy on PB
+competition instances. It extends the generic `Benchmark` base class with
+PB Competition-specific logging and result reporting.
+
+Command-line Interface
+----------------------
+This script can be run directly to benchmark solvers on OPB datasets.
+
+Usage:
+    python opb.py --year 2024 --track OPT-LIN --solver ortools
+
+Arguments:
+    --year          Competition year (e.g., 2024).
+    --track         Track type (e.g., OPT-LIN, DEC-LIN).
+    --solver        Solver name (e.g., ortools, exact, choco, ...).
+    --workers       Number of parallel workers to use.
+    --time-limit    Time limit in seconds per instance.
+    --mem-limit     Memory limit in MB per instance.
+    --cores         Number of cores to assign to a single instance.
+    --output-dir    Output directory for CSV files.
+    --verbose       Show solver output if set.
+    --intermediate  Report intermediate solutions if supported.
+
+===============
+List of classes
+===============
+
+.. autosummary::
+    :nosignatures:
+
+    OPBBenchmark
+
+=================
+List of functions
+=================
+
+.. 
autosummary:: + :nosignatures: + + solution_opb +""" + +import warnings +import argparse +from enum import Enum +from pathlib import Path +from datetime import datetime + +# CPMpy +from cpmpy.tools.benchmark.runner import benchmark_runner +from cpmpy.tools.benchmark._base import Benchmark +from cpmpy.tools.opb import read_opb +from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus + + +class ExitStatus(Enum): + unsupported:str = "UNSUPPORTED" # instance contains an unsupported feature (e.g. a unsupported global constraint) + sat:str = "SATISFIABLE" # CSP : found a solution | COP : found a solution but couldn't prove optimality + optimal:str = "OPTIMUM" + chr(32) + "FOUND" # optimal COP solution found + unsat:str = "UNSATISFIABLE" # instance is unsatisfiable + unknown:str = "UNKNOWN" # any other case + +def solution_opb(model): + """ + Formats a solution according to the PB24 specification. + + Arguments: + model: CPMpy model for which to format its solution (should be solved first) + + Returns: + Formatted model solution according to PB24 specification. + """ + variables = [var for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]] # dirty workaround for all missed aux vars in user vars TODO + return " ".join([var.name.replace("[","").replace("]","") if var.value() else "-"+var.name.replace("[","").replace("]","") for var in variables]) + +class OPBBenchmark(Benchmark): + """ + The PB competition as a CPMpy benchmark. 
+ """ + + def __init__(self): + super().__init__(reader=read_opb) + + def print_comment(self, comment:str): + print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) + + def print_status(self, status: ExitStatus) -> None: + print('s' + chr(32) + status.value, end="\n", flush=True) + + def print_value(self, value: str) -> None: + value = value[:-2].replace("\n", "\nv" + chr(32)) + value[-2:] + print('v' + chr(32) + value, end="\n", flush=True) + + def print_objective(self, objective: int) -> None: + print('o' + chr(32) + str(objective), end="\n", flush=True) + + def print_result(self, s): + if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_result() + self.print_value(solution_opb(s)) + self.print_status(ExitStatus.optimal) + elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_value(solution_opb(s)) + self.print_status(ExitStatus.sat) + elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: + self.print_status(ExitStatus.unsat) + else: + self.print_comment("Solver did not find any solution within the time/memory limit") + self.print_status(ExitStatus.unknown) + + def handle_memory_error(self, mem_limit): + super().handle_memory_error(mem_limit) + self.print_status(ExitStatus.unknown) + + def handle_not_implemented(self, e): + super().handle_not_implemented(e) + self.print_status(ExitStatus.unsupported) + + def handle_exception(self, e): + super().handle_exception(e) + self.print_status(ExitStatus.unknown) + + def parse_output_line(self, line, result): + if line.startswith('s '): + result['status'] = line[2:].strip() + elif line.startswith('v '): + # only record first line, contains 'type' and 'cost' + solution = line.split("\n")[0][2:].strip() + if solution not in result: + result['solution'] = solution + else: + result['solution'] = result['solution'] + ' ' + str(solution) + elif line.startswith('o '): + obj = int(line[2:].strip()) + if result['intermediate'] is None: + result['intermediate'] = [] + result['intermediate'] += 
[(sol_time, obj)]
+            result['objective_value'] = obj
+            obj = None
+        elif line.startswith('c Solution'):
+            parts = line.split(', time = ')
+            # Get solution time from comment for intermediate solution -> used for annotating 'o ...' lines
+            sol_time = float(parts[-1].replace('s', '').rstrip())
+        elif line.startswith('c took '):
+            # Parse timing information
+            parts = line.split(' seconds to ')
+            if len(parts) == 2:
+                time_val = float(parts[0].replace('c took ', ''))
+                action = parts[1].strip()
+                if action.startswith('parse'):
+                    result['time_parse'] = time_val
+                elif action.startswith('convert'):
+                    result['time_model'] = time_val
+                elif action.startswith('post'):
+                    result['time_post'] = time_val
+                elif action.startswith('solve'):
+                    result['time_solve'] = time_val
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Benchmark solvers on OPB instances')
+    parser.add_argument('--year', type=int, required=True, help='Competition year (e.g., 2023)')
+    parser.add_argument('--track', type=str, required=True, help='Track type (e.g., OPT-LIN, DEC-LIN)')
+    parser.add_argument('--solver', type=str, required=True, help='Solver name (e.g., ortools, exact, choco, ...)')
+    parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers')
+    parser.add_argument('--time-limit', type=int, default=300, help='Time limit in seconds per instance')
+    parser.add_argument('--mem-limit', type=int, default=8192, help='Memory limit in MB per instance')
+    parser.add_argument('--cores', type=int, default=1, help='Number of cores to assign to a single instance')
+    parser.add_argument('--output-dir', type=str, default='results', help='Output directory for CSV files')
+    parser.add_argument('--verbose', action='store_true', help='Show solver output')
+    parser.add_argument('--intermediate', action='store_true', help='Report on intermediate solutions')
+    args = parser.parse_args()
+
+    if not args.verbose:
+        warnings.filterwarnings("ignore")
+
+    # Load 
benchmark instances (as a dataset) + from cpmpy.tools.dataset.model.opb import OPBDataset + dataset = OPBDataset(year=args.year, track=args.track, download=True) + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Get current timestamp in a filename-safe format + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Define output file path with timestamp + output_file = str(output_dir / "opb" / f"opb_{args.year}_{args.track}_{args.solver}_{timestamp}.csv") + + # Run the benchmark + instance_runner = OPBBenchmark() + output_file = benchmark_runner(dataset=dataset, instance_runner=instance_runner, output_file=output_file, **vars(args)) + print(f"Results added to {output_file}") diff --git a/cpmpy/tools/benchmark/runner.py b/cpmpy/tools/benchmark/runner.py new file mode 100644 index 000000000..325ac54cd --- /dev/null +++ b/cpmpy/tools/benchmark/runner.py @@ -0,0 +1,287 @@ +""" +Benchmark Runner for CPMpy Instances + +This module provides tools to execute benchmark instances in parallel while +safely capturing solver output, enforcing time and memory limits, and +writing structured results to a CSV file. The included functions should not +be used directly, but rather through one of the available benchmarks. + +Key Features +------------ +- Supports running multiple instances in parallel using threads. +- Executes each instance in a separate subprocess for isolation. +- Forwards stdout to both console and parent process, preserving output. +- Handles timeouts and SIGTERM/SIGKILL signals gracefully. +- Writes results to a CSV file. +- Optional reporting of intermediate solutions and solution checking. 
+""" + +import csv +from io import StringIO +import os +import signal +import time +import sys +import warnings +import traceback +import multiprocessing +from tqdm import tqdm +from typing import Optional, Tuple +from filelock import FileLock +from concurrent.futures import ThreadPoolExecutor + +from cpmpy.tools.xcsp3.xcsp3_cpmpy import xcsp3_cpmpy, init_signal_handlers, ExitStatus + +class Tee: + """ + A stream-like object that duplicates writes to multiple underlying streams. + """ + def __init__(self, *streams): + """ + Arguments: + *streams: Any number of file-like objects that implement a write() method, + such as sys.stdout, sys.stderr, or StringIO. + """ + self.streams = streams + + def write(self, data): + """ + Write data to all underlying streams. + + Args: + data (str): The string to write. + """ + for s in self.streams: + s.write(data) + + def flush(self): + """ + Flush all underlying streams to ensure all data is written out. + """ + for s in self.streams: + s.flush() + +class PipeWriter: + """ + Stdout wrapper for a multiprocessing pipe. + """ + def __init__(self, conn): + self.conn = conn + def write(self, data): + if data: # avoid empty writes + try: + self.conn.send(data) + except: + pass + def flush(self): + pass # no buffering + + +def wrapper(instance_runner, conn, kwargs, verbose): + """ + Wraps a call to a benchmark as to correctly + forward stdout to the multiprocessing pipe (conn). + Also sends a last status report though the pipe. + + Status report can be missing when process has been terminated by a SIGTERM. 
+ """ + + original_stdout = sys.stdout + pipe_writer = PipeWriter(conn) + + if not verbose: + warnings.filterwarnings("ignore") + sys.stdout = pipe_writer # only forward to pipe + else: + sys.stdout = Tee(original_stdout, pipe_writer) # forward to pipe and console + + try: + init_signal_handlers() # configure OS signal handlers + instance_runner.run(**kwargs) + conn.send({"status": "ok"}) + except Exception as e: # capture exceptions and report in state + tb_str = traceback.format_exc() + conn.send({"status": "error", "exception": e, "traceback": tb_str}) + finally: + sys.stdout = original_stdout + conn.close() + +# exec_args = (instance_runner, filename, metadata, open, solver, time_limit, mem_limit, output_file, verbose) +def execute_instance(args: Tuple[callable, str, dict, callable, str, int, int, int, str, bool, bool, str]) -> None: + """ + Solve a single benchmark instance and write results to file immediately. + + Args is a list of: + filename: Path to the instance file + metadata: Dictionary containing instance metadata (year, track, name) + solver: Name of the solver to use + time_limit: Time limit in seconds + mem_limit: Memory limit in MB + output_file: Path to the output CSV file + verbose: Whether to show solver output + """ + + instance_runner, filename, metadata, open, solver, time_limit, mem_limit, cores, output_file, verbose, intermediate, checker_path = args + + # Fieldnames for the CSV file + fieldnames = ['instance'] + list(metadata.keys()) + \ + ['solver', + 'time_total', 'time_parse', 'time_model', 'time_post', 'time_solve', + 'status', 'objective_value', 'solution', 'intermediate', 'checker_result'] + result = dict.fromkeys(fieldnames) # init all fields to None + for k in metadata.keys(): + result[k] = metadata[k] + result['solver'] = solver + + # Decompress before timers start + with open(filename) as f: # <- dataset-specific 'open' callable + filename = StringIO(f.read()) # read to memory-mapped file + + # Start total timing + total_start = 
time.time() + + # Call xcsp3 in separate process + ctx = multiprocessing.get_context("spawn") + parent_conn, child_conn = multiprocessing.Pipe() # communication pipe between processes + process = ctx.Process(target=wrapper, args=( + instance_runner, + child_conn, + { + "instance": filename, + "solver": solver, + "time_limit": time_limit, + "mem_limit": mem_limit, + "intermediate": intermediate, + "force_mem_limit": True, + "time_buffer": 1, + "cores": cores, + }, + verbose)) + process.start() + process.join(timeout=time_limit) + + # Replicate competition convention on how jobs get terminated + if process.is_alive(): + # Send sigterm to let process know it reached its time limit + os.kill(process.pid, signal.SIGTERM) + # 1 second grace period + process.join(timeout=1) + # Kill if still alive + if process.is_alive(): + os.kill(process.pid, signal.SIGKILL) + process.join() + + result['time_total'] = time.time() - total_start + + # Default status if nothing returned by subprocess + # -> process exited prematurely due to sigterm + status = {"status": "error", "exception": "sigterm"} + + # Parse the output to get status, solution and timings + while parent_conn.poll(timeout=1): + line = parent_conn.recv() + + # Received a print statement from the subprocess + if isinstance(line, str): + instance_runner.parse_output_line(line, result) + + # Received a new status from the subprocess + elif isinstance(line, dict): + status = line + + else: + raise() + + # Parse the exit status + if status["status"] == "error": + # Ignore timeouts + if "TimeoutError" in repr(status["exception"]): + pass + # All other exceptions, put in solution field + elif result['solution'] is None: + result['status'] = ExitStatus.unknown.value + result["solution"] = status["exception"] + + # if checker_path is not None and complete_solution is not None: TODO: generalise 'checkers' for benchmarks + # checker_output, checker_time = run_solution_checker( + # JAR=checker_path, + # instance_location=file_path, 
+ # out_file="'" + complete_solution.replace("\n\r", " ").replace("\n", " ").replace("v ", "").replace("v ", "")+ "'", + # verbose=verbose, + # cpm_time=result.get('time_solve', 0) # or total solve time you have + # ) + + # if checker_output is not None: + # result['checker_result'] = checker_output + # else: + # result['checker_result'] = None + + # Use a lock file to prevent concurrent writes + lock_file = f"{output_file}.lock" + lock = FileLock(lock_file) + try: + with lock: + # Pre-check if file exists to determine if we need to write header + write_header = not os.path.exists(output_file) + + with open(output_file, 'a', newline='') as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + if write_header: + writer.writeheader() + writer.writerow(result) + finally: + # Optional: cleanup if the lock file somehow persists + if os.path.exists(lock_file): + try: + os.remove(lock_file) + except Exception: + pass # avoid crashing on cleanup + + + +def benchmark_runner( + dataset, instance_runner, + output_file: str, + solver: str, workers: int = 1, + time_limit: int = 300, mem_limit: Optional[int] = 4096, cores: int=1, + verbose: bool = False, intermediate: bool = False, + checker_path: Optional[str] = None, + **kwargs + ) -> str: + """ + Run a benchmark over all instances in a dataset using multiple threads. + + Arguments: + dataset (_Dataset): Dataset object containing instances to benchmark. + instance_runner (Benchmark): Benchmark runner that implements the run() method. + output_file (str): Path to the CSV file where results will be stored. + solver (str): Name of the solver to use. + workers (int): Number of parallel processes to run instances (default=1). + time_limit (int): Time limit in seconds for each instance (default=300). + mem_limit (int, optional): Memory limit in MB per instance (default=4096). + cores (int): Number of CPU cores assigned per instance (default=1). + verbose (bool): Whether to show solver output in stdout (default=False). 
+ intermediate (bool): Whether to report intermediate solutions if supported (default=False). + checker_path (str, optional): Path to a solution checker for validating instance solutions. + **kwargs: Additional arguments passed to `execute_instance`. + + Returns: + str: Path to the CSV file where benchmark results were written. + """ + + # Process instances in parallel + with ThreadPoolExecutor(max_workers=workers) as executor: + # Submit all tasks and track their futures + futures = [executor.submit(execute_instance, # below: args + (instance_runner, filename, metadata, dataset.open(), solver, time_limit, mem_limit, cores, output_file, verbose, intermediate, checker_path)) + for filename, metadata in dataset] + # Process results as they complete + for i,future in enumerate(tqdm(futures, total=len(futures), desc=f"Running {solver}")): + try: + _ = future.result(timeout=time_limit+60) # for cleanliness sake, result is empty + except TimeoutError: + pass + except Exception as e: + print(f"Job {i}: {dataset[i][1]['name']}, ProcessPoolExecutor caught: {e}") + + return output_file From 83454e00dae27aa38acffe8ee00cb60c8808d6ac Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 14:00:47 +0200 Subject: [PATCH 010/152] Formatting --- cpmpy/tools/benchmark/mse.py | 10 +++++----- cpmpy/tools/dataset/model/mse.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/cpmpy/tools/benchmark/mse.py b/cpmpy/tools/benchmark/mse.py index 08084b645..3654c2bc8 100644 --- a/cpmpy/tools/benchmark/mse.py +++ b/cpmpy/tools/benchmark/mse.py @@ -43,15 +43,15 @@ solution_mse """ +import warnings import argparse -from datetime import datetime from enum import Enum from pathlib import Path -import warnings +from datetime import datetime +# CPMpy from cpmpy.tools.benchmark.runner import benchmark_runner -from _base import Benchmark - +from cpmpy.tools.benchmark._base import Benchmark from cpmpy.tools.wcnf import read_wcnf from cpmpy.solvers.solver_interface 
import ExitStatus as CPMStatus @@ -80,7 +80,7 @@ def solution_mse(model): class MSEBenchmark(Benchmark): """ - Benchmark runner for MSE (MaxSAT Evaluation) competition instances. + MSE (MaxSAT Evaluation) competition as a CPMpy benchmark. This class extends `Benchmark` to implement MSE-specific solution printing in DIMACS-like output format (`c`, `s`, `v`, `o` lines). It uses CPMpy's `read_wcnf` diff --git a/cpmpy/tools/dataset/model/mse.py b/cpmpy/tools/dataset/model/mse.py index 84e8c5dfa..711a560bb 100644 --- a/cpmpy/tools/dataset/model/mse.py +++ b/cpmpy/tools/dataset/model/mse.py @@ -4,8 +4,9 @@ https://maxsat-evaluations.github.io/ """ -import lzma + import os +import lzma import zipfile import pathlib from urllib.request import urlretrieve From 7f2d363282588ac9c298e80df5c14bb8760daf80 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 14:06:44 +0200 Subject: [PATCH 011/152] XCSP3 as dataset and benchmark --- cpmpy/tools/benchmark/xcsp3.py | 248 +++++++++++++++++++++++++++++ cpmpy/tools/dataset/model/xcsp3.py | 132 +++++++++++++++ cpmpy/tools/xcsp3/__init__.py | 137 ++-------------- cpmpy/tools/xcsp3/parser.py | 146 +++++++++++++++++ 4 files changed, 543 insertions(+), 120 deletions(-) create mode 100644 cpmpy/tools/benchmark/xcsp3.py create mode 100644 cpmpy/tools/dataset/model/xcsp3.py create mode 100644 cpmpy/tools/xcsp3/parser.py diff --git a/cpmpy/tools/benchmark/xcsp3.py b/cpmpy/tools/benchmark/xcsp3.py new file mode 100644 index 000000000..e52e41a4a --- /dev/null +++ b/cpmpy/tools/benchmark/xcsp3.py @@ -0,0 +1,248 @@ +""" +XCSP3 competition as a CPMpy benchmark + +This module provides a benchmarking framework for running CPMpy on XCSP3 +competition instances. It extends the generic `Benchmark` base class with +XCSP3-specific logging and result reporting. + +Command-line Interface +---------------------- +This script can be run directly to benchmark solvers on XCSP3 datasets. 
+ +Usage: + python xcsp3.py --year 2024 --track CSP --solver ortools + +Arguments: + --year Competition year (e.g., 2024). + --track Track type (e.g., CSP, COP). + --solver Solver name (e.g., ortools, exact, choco, ...). + --workers Number of parallel workers to use. + --time-limit Time limit in seconds per instance. + --mem-limit Memory limit in MB per instance. + --cores Number of cores to assign to a single instance. + --output-dir Output directory for CSV files. + --verbose Show solver output if set. + --intermediate Report intermediate solutions if supported. + +=============== +List of classes +=============== + +.. autosummary:: + :nosignatures: + + XCSP3Benchmark + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + solution_xcsp3 +""" + +import warnings +import argparse +from enum import Enum +from pathlib import Path +from datetime import datetime + +# CPMpy +from cpmpy.tools.benchmark.runner import benchmark_runner +from cpmpy.tools.benchmark._base import Benchmark +from cpmpy.tools.xcsp3 import read_xcsp3 +from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus + +# PyCSP3 +from xml.etree.ElementTree import ParseError +import xml.etree.cElementTree as ET + + +class ExitStatus(Enum): + unsupported:str = "UNSUPPORTED" # instance contains an unsupported feature (e.g. a unsupported global constraint) + sat:str = "SATISFIABLE" # CSP : found a solution | COP : found a solution but couldn't prove optimality + optimal:str = "OPTIMUM" + chr(32) + "FOUND" # optimal COP solution found + unsat:str = "UNSATISFIABLE" # instance is unsatisfiable + unknown:str = "UNKNOWN" # any other case + +def solution_xcsp3(model, useless_style="*", boolean_style="int"): + """ + Formats a solution according to the XCSP3 specification. + + Arguments: + model: CPMpy model for which to format its solution (should be solved first) + useless_style: How to process unused decision variables (with value `None`). 
+ If "*", variable is included in reporting with value "*". + If "drop", variable is excluded from reporting. + boolean_style: Print style for boolean constants. + "int" results in 0/1, "bool" results in False/True. + + Returns: + XML-formatted model solution according to XCSP3 specification. + """ + + # CSP + if not model.has_objective(): + root = ET.Element("instantiation", type="solution") + # COP + else: + root = ET.Element("instantiation", type="optimum", cost=str(int(model.objective_value()))) + + # How useless variables should be handled + # (variables which have value `None` in the solution) + variables = {var.name: var for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]} # dirty workaround for all missed aux vars in user vars + if useless_style == "*": + variables = {k:(v.value() if v.value() is not None else "*") for k,v in variables.items()} + elif useless_style == "drop": + variables = {k:v.value() for k,v in variables.items() if v.value() is not None} + + # Convert booleans + if boolean_style == "bool": + pass + elif boolean_style == "int": + variables = {k:(v if (not isinstance(v, bool)) else (1 if v else 0)) for k,v in variables.items()} + + # Build XCSP3 XML tree + ET.SubElement(root, "list").text=" " + " ".join([str(v) for v in variables.keys()]) + " " + ET.SubElement(root, "values").text=" " + " ".join([str(v) for v in variables.values()]) + " " + tree = ET.ElementTree(root) + ET.indent(tree, space=" ", level=0) + res = ET.tostring(root).decode("utf-8") + + return str(res) + + +class XCSP3Benchmark(Benchmark): + """ + The XCSP3 competition as a CPMpy benchmark. 
+ """ + + def __init__(self): + super().__init__(reader=read_xcsp3) + + def print_comment(self, comment:str): + print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) + + def print_status(self, status: ExitStatus) -> None: + print('s' + chr(32) + status.value, end="\n", flush=True) + + def print_value(self, value: str) -> None: + value = value[:-2].replace("\n", "\nv" + chr(32)) + value[-2:] + print('v' + chr(32) + value, end="\n", flush=True) + + def print_objective(self, objective: int) -> None: + print('o' + chr(32) + str(objective), end="\n", flush=True) + + def print_result(self, s): + if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_result() + self.print_value(solution_xcsp3(s)) + self.print_status(ExitStatus.optimal) + elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_value(solution_xcsp3(s)) + self.print_status(ExitStatus.sat) + elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: + self.print_status(ExitStatus.unsat) + else: + self.print_comment("Solver did not find any solution within the time/memory limit") + self.print_status(ExitStatus.unknown) + + def handle_memory_error(self, mem_limit): + super().handle_memory_error(mem_limit) + self.print_status(ExitStatus.unknown) + + def handle_not_implemented(self, e): + super().handle_not_implemented(e) + self.print_status(ExitStatus.unsupported) + + def handle_exception(self, e): + if isinstance(e, ParseError): + if "out of memory" in e.msg: + self.print_comment(f"MemoryError raised by parser.") + self.print_status(ExitStatus.unknown) + else: + self.print_comment(f"An {type(e)} got raised by the parser: {e}") + self.print_status(ExitStatus.unknown) + else: + super().handle_exception(e) + self.print_status(ExitStatus.unknown) + + def parse_output_line(self, line, result): + if line.startswith('s '): + result['status'] = line[2:].strip() + elif line.startswith('v ') and result['solution'] is None: + # only record first line, contains 'type' and 'cost' + solution = 
line.split("\n")[0][2:].strip()
+            result['solution'] = str(solution)
+            complete_solution = line
+            if "cost" in solution:
+                result['objective_value'] = solution.split('cost="')[-1][:-2]
+        elif line.startswith('o '):
+            obj = int(line[2:].strip())
+            if result['intermediate'] is None:
+                result['intermediate'] = []
+            result['intermediate'] += [(sol_time, obj)]
+            result['objective_value'] = obj
+            obj = None
+        elif line.startswith('c Solution'):
+            parts = line.split(', time = ')
+            # Get solution time from comment for intermediate solution -> used for annotating 'o ...' lines
+            sol_time = float(parts[-1].replace('s', '').rstrip())
+        elif line.startswith('c took '):
+            # Parse timing information
+            parts = line.split(' seconds to ')
+            if len(parts) == 2:
+                time_val = float(parts[0].replace('c took ', ''))
+                action = parts[1].strip()
+                if action.startswith('parse'):
+                    result['time_parse'] = time_val
+                elif action.startswith('convert'):
+                    result['time_model'] = time_val
+                elif action.startswith('post'):
+                    result['time_post'] = time_val
+                elif action.startswith('solve'):
+                    result['time_solve'] = time_val
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Benchmark solvers on XCSP3 instances')
+    parser.add_argument('--year', type=int, required=True, help='Competition year (e.g., 2023)')
+    parser.add_argument('--track', type=str, required=True, help='Track type (e.g., COP, CSP, MiniCOP)')
+    parser.add_argument('--solver', type=str, required=True, help='Solver name (e.g., ortools, exact, choco, ...)')
+    parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers')
+    parser.add_argument('--time-limit', type=int, default=300, help='Time limit in seconds per instance')
+    parser.add_argument('--mem-limit', type=int, default=8192, help='Memory limit in MB per instance')
+    parser.add_argument('--cores', type=int, default=1, help='Number of cores to assign to a single instance')
+    parser.add_argument('--output-dir', type=str, 
default='results', help='Output directory for CSV files')
+    parser.add_argument('--verbose', action='store_true', help='Show solver output')
+    parser.add_argument('--intermediate', action='store_true', help='Report on intermediate solutions')
+    parser.add_argument('--checker-path', type=str, default=None,
+                        help='Path to the XCSP3 solution checker JAR file')
+    args = parser.parse_args()
+
+    if not args.verbose:
+        warnings.filterwarnings("ignore")
+
+    # Load benchmark instances (as a dataset)
+    from cpmpy.tools.dataset.model.xcsp3 import XCSP3Dataset
+    dataset = XCSP3Dataset(year=args.year, track=args.track, download=True)
+
+    # Create output directory
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Get current timestamp in a filename-safe format
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    # Define output file path with timestamp
+    output_file = str(output_dir / "xcsp3" / f"xcsp3_{args.year}_{args.track}_{args.solver}_{timestamp}.csv")
+
+    # Run the benchmark
+    instance_runner = XCSP3Benchmark()
+    output_file = benchmark_runner(dataset=dataset, instance_runner=instance_runner, output_file=output_file, **vars(args))
+    print(f"Results added to {output_file}")
+
+
diff --git a/cpmpy/tools/dataset/model/xcsp3.py b/cpmpy/tools/dataset/model/xcsp3.py
new file mode 100644
index 000000000..e71df1d04
--- /dev/null
+++ b/cpmpy/tools/dataset/model/xcsp3.py
@@ -0,0 +1,132 @@
+"""
+XCSP3 Dataset
+
+https://xcsp.org/instances/
+"""
+
+from functools import partial
+import os
+import lzma
+import zipfile
+import pathlib
+from urllib.request import urlretrieve
+from urllib.error import HTTPError, URLError
+
+from cpmpy.tools.dataset._base import _Dataset
+
+
+class XCSP3Dataset(_Dataset):
+    """
+    XCSP3 benchmark dataset.
+
+    Provides access to benchmark instances from the XCSP3
+    competitions. Instances are grouped by `year` and `track` (e.g.,
+    `"CSP"`, `"eCOP"`) and stored as `.xml.lzma` files.
+    If the dataset is not available locally, it can be automatically
+    downloaded and extracted.
+
+    More information on the competition can be found here: https://xcsp.org/competitions/
+    """
+
+    def __init__(
+        self,
+        root: str = ".",
+        year: int = 2023, track: str = "CSP",
+        transform=None, target_transform=None,
+        download: bool = False
+    ):
+        """
+        Constructor for a dataset object of the XCSP3 competition.
+
+        Arguments:
+            root (str): Root directory where datasets are stored or will be downloaded to (default=".").
+            year (int): Competition year of the dataset to use (default=2023).
+            track (str): Track name specifying which subset of the competition instances to load (default="CSP").
+            transform (callable, optional): Optional transform applied to the instance file path.
+            target_transform (callable, optional): Optional transform applied to the metadata dictionary.
+            download (bool): If True, downloads the dataset if it does not exist locally (default=False).
+
+
+        Raises:
+            ValueError: If the dataset directory does not exist and `download=False`,
+                or if the requested year/track combination is not available.
+        """
+
+        self.root = pathlib.Path(root)
+        self.year = year
+        self.track = track
+
+        dataset_dir = self.root / str(year) / track
+
+        super().__init__(
+            dataset_dir=dataset_dir,
+            transform=transform, target_transform=target_transform,
+            download=download, extension=".xml.lzma"
+        )
+
+
+    def category(self):
+        return {
+            "year": self.year,
+            "track": self.track
+        }
+
+    def download(self):
+        print(f"Downloading XCSP3 {self.year} instances...")
+
+        url = f"https://www.cril.univ-artois.fr/~lecoutre/compets/"
+        year_suffix = str(self.year)[2:]  # Drop the starting '20'
+        url_path = url + f"instancesXCSP{year_suffix}.zip"
+        zip_path = self.root / f"instancesXCSP{year_suffix}.zip"
+
+        try:
+            urlretrieve(url_path, str(zip_path))
+        except (HTTPError, URLError) as e:
+            raise ValueError(f"No dataset available for year {self.year}. 
Error: {str(e)}") + + # Extract only the specific track folder from the zip + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + # Get the main folder name (e.g., "024_V3") + main_folder = None + for name in zip_ref.namelist(): + if '/' in name: + main_folder = name.split('/')[0] + break + + if main_folder is None: + raise ValueError(f"Could not find main folder in zip file") + + # Extract only files from the specified track + # Get all unique track names from zip + tracks = set() + for file_info in zip_ref.infolist(): + parts = file_info.filename.split('/') + if len(parts) > 2 and parts[0] == main_folder: + tracks.add(parts[1]) + + # Check if requested track exists + if self.track not in tracks: + raise ValueError(f"Track '{self.track}' not found in dataset. Available tracks: {sorted(tracks)}") + + # Create track folder in root directory, parents=True ensures recursive creation + self.dataset_dir.mkdir(parents=True, exist_ok=True) + + # Extract files for the specified track + prefix = f"{main_folder}/{self.track}/" + for file_info in zip_ref.infolist(): + if file_info.filename.startswith(prefix): + # Extract file to track_dir, removing main_folder/track prefix + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: + target.write(source.read()) + # Clean up the zip file + zip_path.unlink() + + def open(self, instance: os.PathLike) -> callable: + return partial(lzma.open, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open + + +if __name__ == "__main__": + dataset = XCSP3Dataset(year=2024, track="MiniCOP", download=True) + print("Dataset size:", len(dataset)) + print("Instance 0:", dataset[0]) diff --git a/cpmpy/tools/xcsp3/__init__.py b/cpmpy/tools/xcsp3/__init__.py index d5abf2766..9572943d8 100644 --- a/cpmpy/tools/xcsp3/__init__.py +++ b/cpmpy/tools/xcsp3/__init__.py @@ -4,127 +4,24 @@ ## __init__.py ## """ - Set of utilities for working with 
XCSP3-formatted CP models. - - - ================= - List of functions - ================= - - .. autosummary:: - :nosignatures: - - read_xcsp3 - - ======================== - List of helper functions - ======================== - - .. autosummary:: - :nosignatures: - - _parse_xcsp3 - _load_xcsp3 - - ================== - List of submodules - ================== - - .. autosummary:: - :nosignatures: - - parser_callbacks - analyze - benchmark - xcsp3_cpmpy - dataset - globals +Set of utilities for working with XCSP3-formatted CP models. + +================== +List of submodules +================== + +.. autosummary:: + :nosignatures: + + parser + parser_callbacks + analyze + benchmark + xcsp3_cpmpy + dataset + globals """ -from io import StringIO -import lzma -import os -import cpmpy as cp - -# Special case for optional cpmpy dependencies -from typing import TYPE_CHECKING -if TYPE_CHECKING: - from pycsp3.parser.xparser import CallbackerXCSP3, ParserXCSP3 from .dataset import XCSP3Dataset # for easier importing - -def _parse_xcsp3(path: os.PathLike) -> "ParserXCSP3": - """ - Parses an XCSP3 instance file (.xml) and returns a `ParserXCSP3` instance. - - Arguments: - path: location of the XCSP3 instance to read (expects a .xml file). - - Returns: - A parser object. - """ - try: - from pycsp3.parser.xparser import ParserXCSP3 - except ImportError as e: - raise ImportError("The 'pycsp3' package is required to parse XCSP3 files. " - "Please install it with `pip install pycsp3`.") from e - - parser = ParserXCSP3(path) - return parser - -def _load_xcsp3(parser: "ParserXCSP3") -> cp.Model: - """ - Takes in a `ParserXCSP3` instance and loads its captured model as a CPMpy model. - - Arguments: - parser (ParserXCSP3): A parser object to load from. - - Returns: - The XCSP3 instance loaded as a CPMpy model. 
- """ - from .parser_callbacks import CallbacksCPMPy - from pycsp3.parser.xparser import CallbackerXCSP3 - callbacks = CallbacksCPMPy() - callbacks.force_exit = True - callbacker = CallbackerXCSP3(parser, callbacks) - callbacker.load_instance() - model = callbacks.cpm_model - - return model - - -def read_xcsp3(path: os.PathLike) -> cp.Model: - """ - Reads in an XCSP3 instance (.xml or .xml.lzma) and returns its matching CPMpy model. - - Arguments: - path: location of the XCSP3 instance to read (expects a .xml or .xml.lzma file). - - Returns: - The XCSP3 instance loaded as a CPMpy model. - """ - # Decompress on the fly if still in .lzma format - if str(path).endswith(".lzma"): - path = decompress_lzma(path) - - # Parse and create CPMpy model - parser = _parse_xcsp3(path) - model = _load_xcsp3(parser) - return model - -def decompress_lzma(path: os.PathLike) -> StringIO: - """ - Decompresses a .lzma file. - - Arguments: - path: Location of .lzma file - - Returns: - Memory-mapped decompressed file - """ - # Decompress the XZ file - with lzma.open(path, 'rt', encoding='utf-8') as f: - return StringIO(f.read()) # read to memory-mapped file - - - \ No newline at end of file +from .parser import read_xcsp3 \ No newline at end of file diff --git a/cpmpy/tools/xcsp3/parser.py b/cpmpy/tools/xcsp3/parser.py new file mode 100644 index 000000000..761ef7caa --- /dev/null +++ b/cpmpy/tools/xcsp3/parser.py @@ -0,0 +1,146 @@ +""" +Parser for the XCSP3 format. + + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read_xcsp3 + +======================== +List of helper functions +======================== + +.. 
autosummary:: + :nosignatures: + + _parse_xcsp3 + _load_xcsp3 +""" + +import os +import sys +import argparse +from io import StringIO + +import cpmpy as cp + +# Special case for optional cpmpy dependencies +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pycsp3.parser.xparser import ParserXCSP3 + +def _parse_xcsp3(path: os.PathLike) -> "ParserXCSP3": + """ + Parses an XCSP3 instance file (.xml) and returns a `ParserXCSP3` instance. + + Arguments: + path: location of the XCSP3 instance to read (expects a .xml file). + + Returns: + A parser object. + """ + try: + from pycsp3.parser.xparser import ParserXCSP3 + except ImportError as e: + raise ImportError("The 'pycsp3' package is required to parse XCSP3 files. " + "Please install it with `pip install pycsp3`.") from e + + parser = ParserXCSP3(path) + return parser + +def _load_xcsp3(parser: "ParserXCSP3") -> cp.Model: + """ + Takes in a `ParserXCSP3` instance and loads its captured model as a CPMpy model. + + Arguments: + parser (ParserXCSP3): A parser object to load from. + + Returns: + The XCSP3 instance loaded as a CPMpy model. + """ + from .parser_callbacks import CallbacksCPMPy + from pycsp3.parser.xparser import CallbackerXCSP3 + callbacks = CallbacksCPMPy() + callbacks.force_exit = True + callbacker = CallbackerXCSP3(parser, callbacks) + callbacker.load_instance() + model = callbacks.cpm_model + + return model + +_std_open = open +def read_xcsp3(xcsp3: os.PathLike, open=open) -> cp.Model: + """ + Reads in an XCSP3 instance (.xml or .xml.lzma) and returns its matching CPMpy model. + + Arguments: + xcsp3 (str or os.PathLike): + - A file path to an WCNF file (optionally LZMA-compressed with `.lzma`) + - OR a string containing the WCNF content directly + open: (callable): + If wcnf is the path to a file, a callable to "open" that file (default=python standard library's 'open'). + + Returns: + The XCSP3 instance loaded as a CPMpy model. 
+ """ + # If wcnf is a path to a file -> open file + if isinstance(xcsp3, (str, os.PathLike)) and os.path.exists(xcsp3): + if open is not None: + f = open(xcsp3) + else: + f = _std_open(xcsp3, "rt") + # If wcnf is a string containing a model -> create a memory-mapped file + else: + f = StringIO(xcsp3) + + # Parse and create CPMpy model + parser = _parse_xcsp3(f) + model = _load_xcsp3(parser) + return model + + +def main(): + parser = argparse.ArgumentParser(description="Parse and solve a WCNF model using CPMpy") + parser.add_argument("model", help="Path to a WCNF file (or raw WCNF string if --string is given)") + parser.add_argument("-s", "--solver", default=None, help="Solver name to use (default: CPMpy's default)") + parser.add_argument("--string", action="store_true", help="Interpret the first argument (model) as a raw WCNF string instead of a file path") + parser.add_argument("-t", "--time-limit", type=int, default=None, help="Time limit for the solver in seconds (default: no limit)") + args = parser.parse_args() + + # Build the CPMpy model + try: + if args.string: + model = read_xcsp3(args.model) + else: + model = read_xcsp3(os.path.expanduser(args.model)) + except Exception as e: + sys.stderr.write(f"Error reading model: {e}\n") + sys.exit(1) + + # Solve the model + try: + if args.solver: + result = model.solve(solver=args.solver, time_limit=args.time_limit) + else: + result = model.solve(time_limit=args.time_limit) + except Exception as e: + sys.stderr.write(f"Error solving model: {e}\n") + sys.exit(1) + + # Print results + print("Status:", model.status()) + if result is not None: + if model.has_objective(): + print("Objective:", model.objective_value()) + else: + print("No solution found.") + +if __name__ == "__main__": + main() + \ No newline at end of file From 9173c9faebbb4077368f6ba73c3990804c055fe0 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 14:07:22 +0200 Subject: [PATCH 012/152] Parsers with changeable 'open' --- 
cpmpy/tools/opb/parser.py | 11 ++++++++--- cpmpy/tools/wcnf/parser.py | 17 ++++++++--------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/cpmpy/tools/opb/parser.py b/cpmpy/tools/opb/parser.py index 846c0874b..e300a2752 100644 --- a/cpmpy/tools/opb/parser.py +++ b/cpmpy/tools/opb/parser.py @@ -105,7 +105,8 @@ def _parse_constraint(line, vars): right=rhs ) -def read_opb(opb: Union[str, os.PathLike]) -> cp.Model: +_std_open = open +def read_opb(opb: Union[str, os.PathLike], open=open) -> cp.Model: """ Parser for OPB (Pseudo-Boolean) format. Reads in an instance and returns its matching CPMpy model. @@ -121,6 +122,8 @@ def read_opb(opb: Union[str, os.PathLike]) -> cp.Model: opb (str or os.PathLike): - A file path to an OPB file (optionally LZMA-compressed with `.xz`) - OR a string containing the OPB content directly + open: (callable): + If wcnf is the path to a file, a callable to "open" that file (default=python standard library's 'open'). Returns: cp.Model: The CPMpy model of the OPB instance. @@ -143,8 +146,10 @@ def read_opb(opb: Union[str, os.PathLike]) -> cp.Model: # If opb is a path to a file -> open file if isinstance(opb, (str, os.PathLike)) and os.path.exists(opb): - f_open = lzma.open if str(opb).endswith(".xz") else open - f = f_open(opb, 'rt') + if open is not None: + f = open(opb) + else: + f = _std_open(opb, "rt") # If opb is a string containing a model -> create a memory-mapped file else: f = StringIO(opb) diff --git a/cpmpy/tools/wcnf/parser.py b/cpmpy/tools/wcnf/parser.py index 72cec94c8..84b484979 100644 --- a/cpmpy/tools/wcnf/parser.py +++ b/cpmpy/tools/wcnf/parser.py @@ -1,8 +1,3 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## __init__.py -## """ Parser for the WCNF format. 
@@ -39,8 +34,8 @@ def _get_var(i, vars_dict): vars_dict[i] = cp.boolvar(name=f"x{i}") # <- be carefull that name doesn't clash with generated variables during transformations / user variables return vars_dict[i] - -def read_wcnf(wcnf: Union[str, os.PathLike]) -> cp.Model: +_std_open = open +def read_wcnf(wcnf: Union[str, os.PathLike], open=open) -> cp.Model: """ Parser for WCNF format. Reads in an instance and returns its matching CPMpy model. @@ -48,14 +43,18 @@ def read_wcnf(wcnf: Union[str, os.PathLike]) -> cp.Model: wcnf (str or os.PathLike): - A file path to an WCNF file (optionally LZMA-compressed with `.xz`) - OR a string containing the WCNF content directly + open: (callable): + If wcnf is the path to a file, a callable to "open" that file (default=python standard library's 'open'). Returns: cp.Model: The CPMpy model of the WCNF instance. """ # If wcnf is a path to a file -> open file if isinstance(wcnf, (str, os.PathLike)) and os.path.exists(wcnf): - f_open = lzma.open if str(wcnf).endswith(".xz") else open - f = f_open(wcnf, "rt") + if open is not None: + f = open(wcnf) + else: + f = _std_open(wcnf, "rt") # If wcnf is a string containing a model -> create a memory-mapped file else: f = StringIO(wcnf) From 52b95de5f6e556e0d853b4c5f85c08a13f93087b Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 14:13:58 +0200 Subject: [PATCH 013/152] Type-hints and docstrings --- cpmpy/tools/dataset/_base.py | 20 +++++++++++++++++--- cpmpy/tools/dataset/model/mse.py | 2 +- cpmpy/tools/dataset/model/opb.py | 6 ++++-- cpmpy/tools/dataset/model/xcsp3.py | 2 +- 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index ce2206110..aa22ae930 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -38,18 +38,32 @@ def __init__( self.download() @abstractmethod - def category(self): + def category(self) -> dict: + """ + Labels to distinguish instances into categories 
matching to those of the dataset. + E.g. + - year + - track + """ pass @abstractmethod def download(self, *args, **kwargs): + """ + How the dataset should be downloaded. + """ pass @abstractmethod - def open(self, instance): + def open(self, instance) -> callable: + """ + How an instance file from the dataset should be opened. + Especially usefull when files come compressed and won't work with + python standard library's 'open', e.g. '.xz', '.lzma'. + """ pass - def metadata(self, file): + def metadata(self, file) -> dict: metadata = self.category() | { 'name': pathlib.Path(file).stem.replace(self.extension, ''), 'path': file, diff --git a/cpmpy/tools/dataset/model/mse.py b/cpmpy/tools/dataset/model/mse.py index 711a560bb..8f395d677 100644 --- a/cpmpy/tools/dataset/model/mse.py +++ b/cpmpy/tools/dataset/model/mse.py @@ -64,7 +64,7 @@ def __init__( ) - def category(self): + def category(self) -> dict: return { "year": self.year, "track": self.track diff --git a/cpmpy/tools/dataset/model/opb.py b/cpmpy/tools/dataset/model/opb.py index bc051d784..40e6a282d 100644 --- a/cpmpy/tools/dataset/model/opb.py +++ b/cpmpy/tools/dataset/model/opb.py @@ -63,19 +63,21 @@ def __init__( download=download, extension=".opb.xz" ) - def category(self): + def category(self) -> dict: return { "year": self.year, "track": self.track } - def metadata(self, file): + def metadata(self, file) -> dict: + # Add the author to the metadata return super().metadata(file) | {'author': str(file).split(os.sep)[-1].split("_")[0],} def download(self): # TODO: add option to filter on competition instances print(f"Downloading OPB {self.year} {self.track} instances...") + url = f"https://www.cril.univ-artois.fr/PB24/benchs/" year_suffix = str(self.year)[2:] # Drop the starting '20' url_path = url + f"normalized-PB{year_suffix}.tar" diff --git a/cpmpy/tools/dataset/model/xcsp3.py b/cpmpy/tools/dataset/model/xcsp3.py index e71df1d04..597a2af55 100644 --- a/cpmpy/tools/dataset/model/xcsp3.py +++ 
b/cpmpy/tools/dataset/model/xcsp3.py @@ -65,7 +65,7 @@ def __init__( ) - def category(self): + def category(self) -> dict: return { "year": self.year, "track": self.track From bf5ecd2c65b537c10ad67a368da0b6631b7c2a58 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys Date: Fri, 12 Sep 2025 14:18:10 +0200 Subject: [PATCH 014/152] Add TODOs --- cpmpy/tools/benchmark/mse.py | 2 +- cpmpy/tools/benchmark/opb.py | 2 +- cpmpy/tools/benchmark/xcsp3.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpmpy/tools/benchmark/mse.py b/cpmpy/tools/benchmark/mse.py index 3654c2bc8..a11b1f5cb 100644 --- a/cpmpy/tools/benchmark/mse.py +++ b/cpmpy/tools/benchmark/mse.py @@ -73,7 +73,7 @@ def solution_mse(model): Returns: str: MSE-formatted solution string. """ - variables = [var for var in model.user_vars if var.name[:2] == "BV"] # dirty workaround for all missed aux vars in user vars + variables = [var for var in model.user_vars if var.name[:2] == "BV"] # dirty workaround for all missed aux vars in user vars TODO fix with Ignace variables = sorted(variables, key=lambda v: int("".join(filter(str.isdigit, v.name)))) return " ".join([str(1 if var.value() else 0) for var in variables]) diff --git a/cpmpy/tools/benchmark/opb.py b/cpmpy/tools/benchmark/opb.py index 3fc5202cd..9d669a075 100644 --- a/cpmpy/tools/benchmark/opb.py +++ b/cpmpy/tools/benchmark/opb.py @@ -73,7 +73,7 @@ def solution_opb(model): Returns: Formatted model solution according to PB24 specification. 
""" - variables = [var for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]] # dirty workaround for all missed aux vars in user vars TODO + variables = [var for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]] # dirty workaround for all missed aux vars in user vars TODO fix with Ignace return " ".join([var.name.replace("[","").replace("]","") if var.value() else "-"+var.name.replace("[","").replace("]","") for var in variables]) class OPBBenchmark(Benchmark): diff --git a/cpmpy/tools/benchmark/xcsp3.py b/cpmpy/tools/benchmark/xcsp3.py index e52e41a4a..9601a4530 100644 --- a/cpmpy/tools/benchmark/xcsp3.py +++ b/cpmpy/tools/benchmark/xcsp3.py @@ -92,7 +92,7 @@ def solution_xcsp3(model, useless_style="*", boolean_style="int"): # How useless variables should be handled # (variables which have value `None` in the solution) - variables = {var.name: var for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]} # dirty workaround for all missed aux vars in user vars + variables = {var.name: var for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]} # dirty workaround for all missed aux vars in user vars TODO fix with Ignace if useless_style == "*": variables = {k:(v.value() if v.value() is not None else "*") for k,v in variables.items()} elif useless_style == "drop": From 5dc388647c8c7fde2a3ec270f01c5506eabf02f7 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 15:07:56 +0200 Subject: [PATCH 015/152] Mising helper functions --- cpmpy/tools/benchmark/__init__.py | 59 +++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/cpmpy/tools/benchmark/__init__.py b/cpmpy/tools/benchmark/__init__.py index e69de29bb..b5a26f62a 100644 --- a/cpmpy/tools/benchmark/__init__.py +++ b/cpmpy/tools/benchmark/__init__.py @@ -0,0 +1,59 @@ + +import resource +import sys +import time +import warnings +import psutil + + +TIME_BUFFER = 5 # seconds +# TODO : see if good value +MEMORY_BUFFER_SOFT = 2 # MiB 
+MEMORY_BUFFER_HARD = 0 # MiB +MEMORY_BUFFER_SOLVER = 20 # MB + + +def set_memory_limit(mem_limit): + """ + Set memory limit (Virtual Memory Size). + """ + if mem_limit is not None: + soft = max(_mib_as_bytes(mem_limit) - _mib_as_bytes(MEMORY_BUFFER_SOFT), _mib_as_bytes(MEMORY_BUFFER_SOFT)) + hard = max(_mib_as_bytes(mem_limit) - _mib_as_bytes(MEMORY_BUFFER_HARD), _mib_as_bytes(MEMORY_BUFFER_HARD)) + if sys.platform != "win32": + resource.setrlimit(resource.RLIMIT_AS, (soft, hard)) # limit memory in number of bytes + else: + warnings.warn("Memory limits using `resource` are not supported on Windows. Skipping hard limit.") + +def set_time_limit(time_limit, verbose:bool=False): + """ + Set time limit (CPU time in seconds). + """ + if time_limit is not None: + if sys.platform != "win32": + soft = time_limit + hard = resource.RLIM_INFINITY + resource.setrlimit(resource.RLIMIT_CPU, (soft, hard)) + else: + warnings.warn("CPU time limits using `resource` are not supported on Windows. Skipping hard limit.") + +def _wall_time(p: psutil.Process): + return time.time() - p.create_time() + +def _mib_as_bytes(mib: int) -> int: + return mib * 1024 * 1024 + +def _mb_as_bytes(mb: int) -> int: + return mb * 1000 * 1000 + +def _bytes_as_mb(bytes: int) -> int: + return bytes // (1000 * 1000) + +def _bytes_as_gb(bytes: int) -> int: + return bytes // (1000 * 1000 * 1000) + +def _bytes_as_mb_float(bytes: int) -> float: + return bytes / (1000 * 1000) + +def _bytes_as_gb_float(bytes: int) -> float: + return bytes / (1000 * 1000 * 1000) \ No newline at end of file From 7209c620d6e22660ea98d5074143ef40bb06e16d Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 15:10:58 +0200 Subject: [PATCH 016/152] Print stacktrace of process --- cpmpy/tools/benchmark/runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpmpy/tools/benchmark/runner.py b/cpmpy/tools/benchmark/runner.py index 325ac54cd..a83740459 100644 --- a/cpmpy/tools/benchmark/runner.py +++ 
b/cpmpy/tools/benchmark/runner.py @@ -273,15 +273,16 @@ def benchmark_runner( with ThreadPoolExecutor(max_workers=workers) as executor: # Submit all tasks and track their futures futures = [executor.submit(execute_instance, # below: args - (instance_runner, filename, metadata, dataset.open(), solver, time_limit, mem_limit, cores, output_file, verbose, intermediate, checker_path)) + (instance_runner, filename, metadata, dataset.open, solver, time_limit, mem_limit, cores, output_file, verbose, intermediate, checker_path)) for filename, metadata in dataset] # Process results as they complete - for i,future in enumerate(tqdm(futures, total=len(futures), desc=f"Running {solver}")): + for i, future in enumerate(tqdm(futures, total=len(futures), desc=f"Running {solver}")): try: - _ = future.result(timeout=time_limit+60) # for cleanliness sake, result is empty + _ = future.result(timeout=time_limit + 60) # for cleanliness sake, result is empty except TimeoutError: pass except Exception as e: print(f"Job {i}: {dataset[i][1]['name']}, ProcessPoolExecutor caught: {e}") + if verbose: traceback.print_exc() return output_file From f66c8c554555d178244dacd50ed487d9f3c22401 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 15:21:22 +0200 Subject: [PATCH 017/152] Fix arguments --- cpmpy/tools/benchmark/_base.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py index 85119a822..c6b4353d9 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -56,14 +56,14 @@ def __init__(self, reader:callable): """ self.reader = reader - def read_instance(self, instance) -> cp.Model: + def read_instance(self, instance, open) -> cp.Model: """ Parse a model instance to a CPMpy model. Arguments: instance (str or os.PathLike): The model instance to parse into a CPMpy model. 
""" - return self.reader(instance) + return self.reader(instance, open=open) """ Callback methods which can be overwritten to make a custom benchmark run. @@ -326,13 +326,13 @@ def solution_count(self): Methods which can, bit most likely shouldn't, be overwritten. """ - def set_memory_limit(self, mem_limit, verbose=False): - set_memory_limit(mem_limit, verbose=verbose) + def set_memory_limit(self, mem_limit): + set_memory_limit(mem_limit) - def set_time_limit(self, time_limit, verbose=False): + def set_time_limit(self, time_limit): p = psutil.Process() if time_limit is not None: - set_time_limit(int(time_limit - _wall_time(p) + time.process_time()), verbose=verbose) + set_time_limit(int(time_limit - _wall_time(p) + time.process_time())) else: set_time_limit(None) @@ -419,11 +419,11 @@ def run( # Set memory limit (if provided) if mem_limit is not None: - self.set_memory_limit(mem_limit, verbose=verbose) + self.set_memory_limit(mem_limit) # Set time limit (if provided) if time_limit is not None: - self.set_time_limit(time_limit, verbose=verbose) # set remaining process time != wall time + self.set_time_limit(time_limit) # set remaining process time != wall time # ------------------------------ Parse instance ------------------------------ # From 6ab8b32932da152140bd94e168757fa9e4027ad5 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 15:21:48 +0200 Subject: [PATCH 018/152] Fix overwritten open --- cpmpy/tools/benchmark/runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/runner.py b/cpmpy/tools/benchmark/runner.py index a83740459..9d3f4c5f6 100644 --- a/cpmpy/tools/benchmark/runner.py +++ b/cpmpy/tools/benchmark/runner.py @@ -107,6 +107,7 @@ def wrapper(instance_runner, conn, kwargs, verbose): conn.close() # exec_args = (instance_runner, filename, metadata, open, solver, time_limit, mem_limit, output_file, verbose) +_std_open = open def execute_instance(args: Tuple[callable, str, dict, callable, str, int, 
int, int, str, bool, bool, str]) -> None: """ Solve a single benchmark instance and write results to file immediately. @@ -224,7 +225,7 @@ def execute_instance(args: Tuple[callable, str, dict, callable, str, int, int, i # Pre-check if file exists to determine if we need to write header write_header = not os.path.exists(output_file) - with open(output_file, 'a', newline='') as f: + with _std_open(output_file, 'a', newline='') as f: writer = csv.DictWriter(f, fieldnames=fieldnames) if write_header: writer.writeheader() From 34c8a9e75828022003afdbc056068eee14f7078e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 15:22:12 +0200 Subject: [PATCH 019/152] Read as string instead of StringIO --- cpmpy/tools/benchmark/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/runner.py b/cpmpy/tools/benchmark/runner.py index 9d3f4c5f6..b0edeb655 100644 --- a/cpmpy/tools/benchmark/runner.py +++ b/cpmpy/tools/benchmark/runner.py @@ -136,7 +136,7 @@ def execute_instance(args: Tuple[callable, str, dict, callable, str, int, int, i # Decompress before timers start with open(filename) as f: # <- dataset-specific 'open' callable - filename = StringIO(f.read()) # read to memory-mapped file + filename = f.read() # read to memory-mapped file # Start total timing total_start = time.time() From fd55b3a204e4fd8578d88f4316bf7ff49eb74702 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 15:22:32 +0200 Subject: [PATCH 020/152] Read as text instead of binary --- cpmpy/tools/dataset/model/mse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpmpy/tools/dataset/model/mse.py b/cpmpy/tools/dataset/model/mse.py index 8f395d677..ef31b0d64 100644 --- a/cpmpy/tools/dataset/model/mse.py +++ b/cpmpy/tools/dataset/model/mse.py @@ -100,7 +100,7 @@ def download(self): zip_path.unlink() def open(self, instance: os.PathLike) -> callable: - return lzma.open if str(instance).endswith(".xz") else open + return 
lzma.open(instance, "rt") if str(instance).endswith(".xz") else open(instance) if __name__ == "__main__": dataset = MSEDataset(year=2024, track="exact-weighted", download=True) From 2be9fa67820ece6aa1044822a5d88b618faa2f40 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 17:11:27 +0200 Subject: [PATCH 021/152] Sigterm callbacks --- cpmpy/tools/benchmark/_base.py | 47 ++++++++++++++++++++++++++++++++- cpmpy/tools/benchmark/mse.py | 39 ++++++++++++++++++++------- cpmpy/tools/benchmark/runner.py | 30 ++++++++++++--------- 3 files changed, 93 insertions(+), 23 deletions(-) diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py index c6b4353d9..3522af9e9 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -30,15 +30,25 @@ from abc import ABC +import os +import signal +import sys import time import random import psutil import warnings +from enum import Enum from typing import Optional import cpmpy as cp from cpmpy.tools.benchmark import _mib_as_bytes, _wall_time, set_memory_limit, set_time_limit, _bytes_as_mb, _bytes_as_gb +class ExitStatus(Enum): + unsupported:str = "unsupported" # instance contains an unsupported feature (e.g. a unsupported global constraint) + sat:str = "sat" # CSP : found a solution | COP : found a solution but couldn't prove optimality + optimal:str = "optimal" # optimal COP solution found + unsat:str = "unsat" # instance is unsatisfiable + unknown:str = "unknown" # any other case class Benchmark(ABC): """ @@ -49,12 +59,13 @@ class Benchmark(ABC): It is designed to be extended or customized for specific benchmarking needs. """ - def __init__(self, reader:callable): + def __init__(self, reader:callable, exit_status:Enum): """ Arguments: reader (callable): A parser from a model format to a CPMPy model. 
""" self.reader = reader + self.exit_status = exit_status def read_instance(self, instance, open) -> cp.Model: """ @@ -92,6 +103,12 @@ def handle_exception(self, e): if line.strip(): self.print_comment(line) + def handle_sigterm(self): + pass + + def handle_rlimit_cpu(self): + pass + """ Solver arguments (can also be tweaked for a specific benchmark). """ @@ -336,6 +353,29 @@ def set_time_limit(self, time_limit): else: set_time_limit(None) + def sigterm_handler(self, _signo, _stack_frame): + exit_code = self.handle_sigterm() + print(flush=True) + os._exit(exit_code) + + def rlimit_cpu_handler(self, _signo, _stack_frame): + exit_code = self.handle_rlimit_cpu() + print(flush=True) + os._exit(exit_code) + + def init_signal_handlers(self): + """ + Configure signal handlers + """ + signal.signal(signal.SIGINT, self.sigterm_handler) + signal.signal(signal.SIGTERM, self.sigterm_handler) + signal.signal(signal.SIGINT, self.sigterm_handler) + signal.signal(signal.SIGABRT, self.sigterm_handler) + if sys.platform != "win32": + signal.signal(signal.SIGXCPU, self.rlimit_cpu_handler) + else: + warnings.warn("Windows does not support setting SIGXCPU signal") + def post_model(self, model, solver, solver_args): """ Post the model to the selected backend solver. @@ -417,6 +457,8 @@ def run( if seed is not None: random.seed(seed) + self.init_signal_handlers() + # Set memory limit (if provided) if mem_limit is not None: self.set_memory_limit(mem_limit) @@ -488,6 +530,9 @@ def run( except NotImplementedError as e: self.handle_not_implemented(e) raise e + except TimeoutError as e: + self.handle_exception(e) # TODO add callback for timeout? 
+ raise e except Exception as e: self.handle_exception(e) raise e diff --git a/cpmpy/tools/benchmark/mse.py b/cpmpy/tools/benchmark/mse.py index a11b1f5cb..3745e4503 100644 --- a/cpmpy/tools/benchmark/mse.py +++ b/cpmpy/tools/benchmark/mse.py @@ -56,7 +56,7 @@ from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus -class ExitStatus(Enum): +class MSEExitStatus(Enum): unsupported:str = "UNSUPPORTED" # instance contains an unsupported feature (e.g. a unsupported global constraint) sat:str = "SATISFIABLE" # CSP : found a solution | COP : found a solution but couldn't prove optimality optimal:str = "OPTIMUM" + chr(32) + "FOUND" # optimal COP solution found @@ -89,12 +89,12 @@ class MSEBenchmark(Benchmark): """ def __init__(self): - super().__init__(reader=read_wcnf) + super().__init__(reader=read_wcnf, exit_status=MSEExitStatus) def print_comment(self, comment:str): print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) - def print_status(self, status: ExitStatus) -> None: + def print_status(self, status: MSEExitStatus) -> None: print('s' + chr(32) + status.value, end="\n", flush=True) def print_value(self, value: str) -> None: @@ -107,27 +107,46 @@ def print_objective(self, objective: int) -> None: def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: self.print_value(solution_mse(s)) - self.print_status(ExitStatus.optimal) + self.print_status(MSEExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: self.print_value(solution_mse(s)) - self.print_status(ExitStatus.sat) + self.print_status(MSEExitStatus.sat) elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: - self.print_status(ExitStatus.unsat) + self.print_status(MSEExitStatus.unsat) else: self.print_comment("Solver did not find any solution within the time/memory limit") - self.print_status(ExitStatus.unknown) + self.print_status(MSEExitStatus.unknown) def handle_memory_error(self, mem_limit): super().handle_memory_error(mem_limit) - 
self.print_status(ExitStatus.unknown) + self.print_status(MSEExitStatus.unknown) def handle_not_implemented(self, e): super().handle_not_implemented(e) - self.print_status(ExitStatus.unsupported) + self.print_status(MSEExitStatus.unsupported) def handle_exception(self, e): super().handle_exception(e) - self.print_status(ExitStatus.unknown) + self.print_status(MSEExitStatus.unknown) + + + def handle_sigterm(self): + """ + Handles a SIGTERM. Gives us 1 second to finish the current job before we get killed. + """ + # Report that we haven't found a solution in time + self.print_status(MSEExitStatus.unknown) + self.print_comment("SIGTERM raised.") + return 0 + + def handle_rlimit_cpu(self): + """ + Handles a SIGXCPU. + """ + # Report that we haven't found a solution in time + self.print_status(MSEExitStatus.unknown) + self.print_comment("SIGXCPU raised.") + return 0 def parse_output_line(self, line, result): if line.startswith('s '): diff --git a/cpmpy/tools/benchmark/runner.py b/cpmpy/tools/benchmark/runner.py index b0edeb655..933dac132 100644 --- a/cpmpy/tools/benchmark/runner.py +++ b/cpmpy/tools/benchmark/runner.py @@ -30,8 +30,6 @@ from filelock import FileLock from concurrent.futures import ThreadPoolExecutor -from cpmpy.tools.xcsp3.xcsp3_cpmpy import xcsp3_cpmpy, init_signal_handlers, ExitStatus - class Tee: """ A stream-like object that duplicates writes to multiple underlying streams. 
@@ -96,14 +94,22 @@ def wrapper(instance_runner, conn, kwargs, verbose): sys.stdout = Tee(original_stdout, pipe_writer) # forward to pipe and console try: - init_signal_handlers() # configure OS signal handlers instance_runner.run(**kwargs) conn.send({"status": "ok"}) + except TimeoutError: + try: + conn.send({"status": "timeout"}) + except (BrokenPipeError, EOFError): + pass except Exception as e: # capture exceptions and report in state tb_str = traceback.format_exc() - conn.send({"status": "error", "exception": e, "traceback": tb_str}) + try: + conn.send({"status": "error", "exception": e, "traceback": tb_str}) + except (BrokenPipeError, EOFError): + pass + #conn.send({"status": "error", "exception": e, "traceback": tb_str}) finally: - sys.stdout = original_stdout + #sys.stdout = original_stdout conn.close() # exec_args = (instance_runner, filename, metadata, open, solver, time_limit, mem_limit, output_file, verbose) @@ -125,7 +131,7 @@ def execute_instance(args: Tuple[callable, str, dict, callable, str, int, int, i instance_runner, filename, metadata, open, solver, time_limit, mem_limit, cores, output_file, verbose, intermediate, checker_path = args # Fieldnames for the CSV file - fieldnames = ['instance'] + list(metadata.keys()) + \ + fieldnames = list(metadata.keys()) + \ ['solver', 'time_total', 'time_parse', 'time_model', 'time_post', 'time_solve', 'status', 'objective_value', 'solution', 'intermediate', 'checker_result'] @@ -194,13 +200,13 @@ def execute_instance(args: Tuple[callable, str, dict, callable, str, int, int, i raise() # Parse the exit status - if status["status"] == "error": + if status["status"] == "timeout": # Ignore timeouts - if "TimeoutError" in repr(status["exception"]): - pass - # All other exceptions, put in solution field - elif result['solution'] is None: - result['status'] = ExitStatus.unknown.value + pass + elif status["status"] == "error": + # All exceptions, put in solution field + if result['solution'] is None: + result['status'] 
= instance_runner.exit_status.unknown.value result["solution"] = status["exception"] # if checker_path is not None and complete_solution is not None: TODO: generalise 'checkers' for benchmarks From 2e646231405bd2f410fcfdb47192259b5ebad0c5 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 18:18:02 +0200 Subject: [PATCH 022/152] Attempt at fixing some nested memory exceptions --- cpmpy/tools/benchmark/__init__.py | 7 +++++++ cpmpy/tools/benchmark/_base.py | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/__init__.py b/cpmpy/tools/benchmark/__init__.py index b5a26f62a..54aa8031f 100644 --- a/cpmpy/tools/benchmark/__init__.py +++ b/cpmpy/tools/benchmark/__init__.py @@ -13,6 +13,7 @@ MEMORY_BUFFER_SOLVER = 20 # MB + def set_memory_limit(mem_limit): """ Set memory limit (Virtual Memory Size). @@ -25,6 +26,12 @@ def set_memory_limit(mem_limit): else: warnings.warn("Memory limits using `resource` are not supported on Windows. Skipping hard limit.") +def disable_memory_limit(): + if sys.platform != "win32": + soft, hard = resource.getrlimit(resource.RLIMIT_AS) + # set a very high soft limit + resource.setrlimit(resource.RLIMIT_AS, (hard, hard)) + def set_time_limit(time_limit, verbose:bool=False): """ Set time limit (CPU time in seconds). diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py index 3522af9e9..2e81505e1 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -41,7 +41,7 @@ from typing import Optional import cpmpy as cp -from cpmpy.tools.benchmark import _mib_as_bytes, _wall_time, set_memory_limit, set_time_limit, _bytes_as_mb, _bytes_as_gb +from cpmpy.tools.benchmark import _mib_as_bytes, _wall_time, set_memory_limit, set_time_limit, _bytes_as_mb, _bytes_as_gb, disable_memory_limit class ExitStatus(Enum): unsupported:str = "unsupported" # instance contains an unsupported feature (e.g. 
a unsupported global constraint) @@ -525,6 +525,7 @@ def run( except MemoryError as e: + disable_memory_limit() self.handle_memory_error(mem_limit) raise e except NotImplementedError as e: From 5b926807300e0d196197bcded7685085bd73cf4c Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 18:18:52 +0200 Subject: [PATCH 023/152] Overwritable exit status --- cpmpy/tools/benchmark/mse.py | 1 + cpmpy/tools/benchmark/opb.py | 39 +++++++++++++++++++++++++--------- cpmpy/tools/benchmark/xcsp3.py | 25 +++++++++++----------- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/cpmpy/tools/benchmark/mse.py b/cpmpy/tools/benchmark/mse.py index 3745e4503..b7d645369 100644 --- a/cpmpy/tools/benchmark/mse.py +++ b/cpmpy/tools/benchmark/mse.py @@ -31,6 +31,7 @@ .. autosummary:: :nosignatures: + MSEExitStatus MSEBenchmark ================= diff --git a/cpmpy/tools/benchmark/opb.py b/cpmpy/tools/benchmark/opb.py index 9d669a075..905d7ab0e 100644 --- a/cpmpy/tools/benchmark/opb.py +++ b/cpmpy/tools/benchmark/opb.py @@ -31,6 +31,7 @@ .. autosummary:: :nosignatures: + OPBExitStatus OPBBenchmark ================= @@ -56,7 +57,7 @@ from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus -class ExitStatus(Enum): +class OPBExitStatus(Enum): unsupported:str = "UNSUPPORTED" # instance contains an unsupported feature (e.g. 
a unsupported global constraint) sat:str = "SATISFIABLE" # CSP : found a solution | COP : found a solution but couldn't prove optimality optimal:str = "OPTIMUM" + chr(32) + "FOUND" # optimal COP solution found @@ -82,12 +83,12 @@ class OPBBenchmark(Benchmark): """ def __init__(self): - super().__init__(reader=read_opb) + super().__init__(reader=read_opb, exit_status=OPBExitStatus) def print_comment(self, comment:str): print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) - def print_status(self, status: ExitStatus) -> None: + def print_status(self, status: OPBExitStatus) -> None: print('s' + chr(32) + status.value, end="\n", flush=True) def print_value(self, value: str) -> None: @@ -101,27 +102,45 @@ def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: self.print_result() self.print_value(solution_opb(s)) - self.print_status(ExitStatus.optimal) + self.print_status(OPBExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: self.print_value(solution_opb(s)) - self.print_status(ExitStatus.sat) + self.print_status(OPBExitStatus.sat) elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: - self.print_status(ExitStatus.unsat) + self.print_status(OPBExitStatus.unsat) else: self.print_comment("Solver did not find any solution within the time/memory limit") - self.print_status(ExitStatus.unknown) + self.print_status(OPBExitStatus.unknown) def handle_memory_error(self, mem_limit): super().handle_memory_error(mem_limit) - self.print_status(ExitStatus.unknown) + self.print_status(OPBExitStatus.unknown) def handle_not_implemented(self, e): super().handle_not_implemented(e) - self.print_status(ExitStatus.unsupported) + self.print_status(OPBExitStatus.unsupported) def handle_exception(self, e): super().handle_exception(e) - self.print_status(ExitStatus.unknown) + self.print_status(OPBExitStatus.unknown) + + def handle_sigterm(self): + """ + Handles a SIGTERM. Gives us 1 second to finish the current job before we get killed. 
+ """ + # Report that we haven't found a solution in time + self.print_status(OPBExitStatus.unknown) + self.print_comment("SIGTERM raised.") + return 0 + + def handle_rlimit_cpu(self): + """ + Handles a SIGXCPU. + """ + # Report that we haven't found a solution in time + self.print_status(OPBExitStatus.unknown) + self.print_comment("SIGXCPU raised.") + return 0 def parse_output_line(self, line, result): if line.startswith('s '): diff --git a/cpmpy/tools/benchmark/xcsp3.py b/cpmpy/tools/benchmark/xcsp3.py index 9601a4530..9dd9849fe 100644 --- a/cpmpy/tools/benchmark/xcsp3.py +++ b/cpmpy/tools/benchmark/xcsp3.py @@ -31,6 +31,7 @@ .. autosummary:: :nosignatures: + XCSP3ExitStatus XCSP3Benchmark ================= @@ -60,7 +61,7 @@ import xml.etree.cElementTree as ET -class ExitStatus(Enum): +class XCSP3ExitStatus(Enum): unsupported:str = "UNSUPPORTED" # instance contains an unsupported feature (e.g. a unsupported global constraint) sat:str = "SATISFIABLE" # CSP : found a solution | COP : found a solution but couldn't prove optimality optimal:str = "OPTIMUM" + chr(32) + "FOUND" # optimal COP solution found @@ -120,12 +121,12 @@ class XCSP3Benchmark(Benchmark): """ def __init__(self): - super().__init__(reader=read_xcsp3) + super().__init__(reader=read_xcsp3, exit_status=XCSP3ExitStatus) def print_comment(self, comment:str): print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) - def print_status(self, status: ExitStatus) -> None: + def print_status(self, status: XCSP3ExitStatus) -> None: print('s' + chr(32) + status.value, end="\n", flush=True) def print_value(self, value: str) -> None: @@ -139,35 +140,35 @@ def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: self.print_result() self.print_value(solution_xcsp3(s)) - self.print_status(ExitStatus.optimal) + self.print_status(XCSP3ExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: self.print_value(solution_xcsp3(s)) - self.print_status(ExitStatus.sat) + 
self.print_status(XCSP3ExitStatus.sat) elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: - self.print_status(ExitStatus.unsat) + self.print_status(XCSP3ExitStatus.unsat) else: self.print_comment("Solver did not find any solution within the time/memory limit") - self.print_status(ExitStatus.unknown) + self.print_status(XCSP3ExitStatus.unknown) def handle_memory_error(self, mem_limit): super().handle_memory_error(mem_limit) - self.print_status(ExitStatus.unknown) + self.print_status(XCSP3ExitStatus.unknown) def handle_not_implemented(self, e): super().handle_not_implemented(e) - self.print_status(ExitStatus.unsupported) + self.print_status(XCSP3ExitStatus.unsupported) def handle_exception(self, e): if isinstance(e, ParseError): if "out of memory" in e.msg: self.print_comment(f"MemoryError raised by parser.") - self.print_status(ExitStatus.unknown) + self.print_status(XCSP3ExitStatus.unknown) else: self.print_comment(f"An {type(e)} got raised by the parser: {e}") - self.print_status(ExitStatus.unknown) + self.print_status(XCSP3ExitStatus.unknown) else: super().handle_exception(e) - self.print_status(ExitStatus.unknown) + self.print_status(XCSP3ExitStatus.unknown) def parse_output_line(self, line, result): if line.startswith('s '): From 8fff25480e8bfdb9f0b7d787d26b4c143fb1fdbd Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 18:19:11 +0200 Subject: [PATCH 024/152] Validate dataset arguments --- cpmpy/tools/dataset/model/mse.py | 6 ++++++ cpmpy/tools/dataset/model/opb.py | 6 ++++++ cpmpy/tools/dataset/model/xcsp3.py | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/cpmpy/tools/dataset/model/mse.py b/cpmpy/tools/dataset/model/mse.py index ef31b0d64..3ddfebf35 100644 --- a/cpmpy/tools/dataset/model/mse.py +++ b/cpmpy/tools/dataset/model/mse.py @@ -55,6 +55,12 @@ def __init__( self.year = year self.track = track + # Check requested dataset + if not str(year).startswith('20'): + raise ValueError("Year must start with '20'") + if not track: + raise 
ValueError("Track must be specified, e.g. OPT-LIN, DEC-LIN, ...") + dataset_dir = self.root / str(year) / track super().__init__( diff --git a/cpmpy/tools/dataset/model/opb.py b/cpmpy/tools/dataset/model/opb.py index 40e6a282d..0915c6509 100644 --- a/cpmpy/tools/dataset/model/opb.py +++ b/cpmpy/tools/dataset/model/opb.py @@ -55,6 +55,12 @@ def __init__( self.year = year self.track = track + # Check requested dataset + if not str(year).startswith('20'): + raise ValueError("Year must start with '20'") + if not track: + raise ValueError("Track must be specified, e.g. exact-weighted, exact-unweighted, ...") + dataset_dir = self.root / str(year) / track super().__init__( diff --git a/cpmpy/tools/dataset/model/xcsp3.py b/cpmpy/tools/dataset/model/xcsp3.py index 597a2af55..21b38f35e 100644 --- a/cpmpy/tools/dataset/model/xcsp3.py +++ b/cpmpy/tools/dataset/model/xcsp3.py @@ -56,6 +56,12 @@ def __init__( self.year = year self.track = track + # Check requested dataset + if not str(year).startswith('20'): + raise ValueError("Year must start with '20'") + if not track: + raise ValueError("Track must be specified, e.g. COP, CSP, ...") + dataset_dir = self.root / str(year) / track super().__init__( From 2b4a8f02daa648e48d2e806b2c6ce98832323237 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 18:19:25 +0200 Subject: [PATCH 025/152] Check non-empty dataset --- cpmpy/tools/dataset/_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index aa22ae930..a8954aa9f 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -36,6 +36,10 @@ def __init__( raise ValueError(f"Dataset not found. Please set download=True to download the dataset.") else: self.download() + + files = sorted(list(self.dataset_dir.glob(f"*{self.extension}"))) + if len(files) == 0: + raise ValueError("Cannot find any instances inside dataset. Is it a valid dataset? 
If so, please report on GitHub.") @abstractmethod def category(self) -> dict: From b68144d160c28f0da421e5f9b986b7492aed9716 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 18:24:13 +0200 Subject: [PATCH 026/152] Add feedback finished downloading --- cpmpy/tools/dataset/_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index a8954aa9f..496780b2d 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -36,6 +36,8 @@ def __init__( raise ValueError(f"Dataset not found. Please set download=True to download the dataset.") else: self.download() + files = sorted(list(self.dataset_dir.glob(f"*{self.extension}"))) + print(f"Finished downloading {len(files)} instances") files = sorted(list(self.dataset_dir.glob(f"*{self.extension}"))) if len(files) == 0: From b08df43dba1e034e7fb88d98ad624161faf534ee Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 12 Sep 2025 18:47:43 +0200 Subject: [PATCH 027/152] Small fixes --- cpmpy/tools/benchmark/opb.py | 1 - cpmpy/tools/benchmark/xcsp3.py | 3 +-- cpmpy/tools/dataset/model/opb.py | 2 +- cpmpy/tools/dataset/model/xcsp3.py | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpmpy/tools/benchmark/opb.py b/cpmpy/tools/benchmark/opb.py index 905d7ab0e..5c1e0f606 100644 --- a/cpmpy/tools/benchmark/opb.py +++ b/cpmpy/tools/benchmark/opb.py @@ -100,7 +100,6 @@ def print_objective(self, objective: int) -> None: def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: - self.print_result() self.print_value(solution_opb(s)) self.print_status(OPBExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: diff --git a/cpmpy/tools/benchmark/xcsp3.py b/cpmpy/tools/benchmark/xcsp3.py index 9dd9849fe..bb2f02410 100644 --- a/cpmpy/tools/benchmark/xcsp3.py +++ b/cpmpy/tools/benchmark/xcsp3.py @@ -138,7 +138,6 @@ def print_objective(self, objective: int) -> None: def print_result(self, s): if 
s.status().exitstatus == CPMStatus.OPTIMAL: - self.print_result() self.print_value(solution_xcsp3(s)) self.print_status(XCSP3ExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: @@ -176,7 +175,7 @@ def parse_output_line(self, line, result): elif line.startswith('v ') and result['solution'] is None: # only record first line, contains 'type' and 'cost' solution = line.split("\n")[0][2:].strip() - result['solution'] = str(solution) + result['solution'] = solution complete_solution = line if "cost" in solution: result['objective_value'] = solution.split('cost="')[-1][:-2] diff --git a/cpmpy/tools/dataset/model/opb.py b/cpmpy/tools/dataset/model/opb.py index 0915c6509..201075749 100644 --- a/cpmpy/tools/dataset/model/opb.py +++ b/cpmpy/tools/dataset/model/opb.py @@ -139,7 +139,7 @@ def download(self): tar_path.unlink() def open(self, instance: os.PathLike) -> callable: - return lzma.open if str(instance).endswith(".xz") else open + return lzma.open(instance, 'rt') if str(instance).endswith(".xz") else open(instance) if __name__ == "__main__": dataset = OPBDataset(year=2024, track="DEC-LIN", download=True) diff --git a/cpmpy/tools/dataset/model/xcsp3.py b/cpmpy/tools/dataset/model/xcsp3.py index 21b38f35e..f17a4d193 100644 --- a/cpmpy/tools/dataset/model/xcsp3.py +++ b/cpmpy/tools/dataset/model/xcsp3.py @@ -129,7 +129,7 @@ def download(self): zip_path.unlink() def open(self, instance: os.PathLike) -> callable: - return partial(lzma.open, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open + return lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance) if __name__ == "__main__": From 431b065609b3772dfa0bff4aa665f1d49d903548 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 10 Oct 2025 13:39:34 +0200 Subject: [PATCH 028/152] Fix intermediate solutions and time tracking --- cpmpy/tools/benchmark/_base.py | 2 +- cpmpy/tools/benchmark/mse.py | 14 ++++++++------ 
cpmpy/tools/benchmark/opb.py | 3 +++ cpmpy/tools/benchmark/runner.py | 1 + cpmpy/tools/benchmark/xcsp3.py | 3 +++ 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py index 2e81505e1..8055d43a6 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -84,7 +84,7 @@ def print_comment(self, comment:str): print(comment) def print_intermediate(self, objective:int): - print("Intermediate solution:", objective) + self.print_comment("Intermediate solution:", objective) def print_result(self, s): self.print_comment(s.status()) diff --git a/cpmpy/tools/benchmark/mse.py b/cpmpy/tools/benchmark/mse.py index b7d645369..656467bf9 100644 --- a/cpmpy/tools/benchmark/mse.py +++ b/cpmpy/tools/benchmark/mse.py @@ -99,11 +99,13 @@ def print_status(self, status: MSEExitStatus) -> None: print('s' + chr(32) + status.value, end="\n", flush=True) def print_value(self, value: str) -> None: - value = value[:-2].replace("\n", "\nv" + chr(32)) + value[-2:] print('v' + chr(32) + value, end="\n", flush=True) def print_objective(self, objective: int) -> None: print('o' + chr(32) + str(objective), end="\n", flush=True) + + def print_intermediate(self, objective:int): + self.print_objective(objective) def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: @@ -159,17 +161,17 @@ def parse_output_line(self, line, result): result['solution'] = solution else: result['solution'] = result['solution'] + ' ' + str(solution) + elif line.startswith('c Solution'): + parts = line.split(', time = ') + # Get solution time from comment for intermediate solution -> used for annotating 'o ...' 
lines + self._sol_time = float(parts[-1].replace('s', '').rstrip()) elif line.startswith('o '): obj = int(line[2:].strip()) if result['intermediate'] is None: result['intermediate'] = [] - result['intermediate'] += [(sol_time, obj)] + result['intermediate'] += [(self._sol_time, obj)] result['objective_value'] = obj obj = None - elif line.startswith('c Solution'): - parts = line.split(', time = ') - # Get solution time from comment for intermediate solution -> used for annotating 'o ...' lines - sol_time = float(parts[-1].replace('s', '').rstrip()) elif line.startswith('c took '): # Parse timing information parts = line.split(' seconds to ') diff --git a/cpmpy/tools/benchmark/opb.py b/cpmpy/tools/benchmark/opb.py index 5c1e0f606..b92fcb257 100644 --- a/cpmpy/tools/benchmark/opb.py +++ b/cpmpy/tools/benchmark/opb.py @@ -98,6 +98,9 @@ def print_value(self, value: str) -> None: def print_objective(self, objective: int) -> None: print('o' + chr(32) + str(objective), end="\n", flush=True) + def print_intermediate(self, objective:int): + self.print_objective(objective) + def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: self.print_value(solution_opb(s)) diff --git a/cpmpy/tools/benchmark/runner.py b/cpmpy/tools/benchmark/runner.py index 933dac132..6bc85e6ae 100644 --- a/cpmpy/tools/benchmark/runner.py +++ b/cpmpy/tools/benchmark/runner.py @@ -94,6 +94,7 @@ def wrapper(instance_runner, conn, kwargs, verbose): sys.stdout = Tee(original_stdout, pipe_writer) # forward to pipe and console try: + kwargs["verbose"] = verbose instance_runner.run(**kwargs) conn.send({"status": "ok"}) except TimeoutError: diff --git a/cpmpy/tools/benchmark/xcsp3.py b/cpmpy/tools/benchmark/xcsp3.py index bb2f02410..47d0289e4 100644 --- a/cpmpy/tools/benchmark/xcsp3.py +++ b/cpmpy/tools/benchmark/xcsp3.py @@ -136,6 +136,9 @@ def print_value(self, value: str) -> None: def print_objective(self, objective: int) -> None: print('o' + chr(32) + str(objective), end="\n", flush=True) 
+ def print_intermediate(self, objective:int): + self.print_objective(objective) + def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: self.print_value(solution_xcsp3(s)) From 7d98c354f8668ca61b1ce9950564d22fa0cf66fe Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 10 Oct 2025 13:39:54 +0200 Subject: [PATCH 029/152] Increase intermediate solution time resolution --- cpmpy/tools/benchmark/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py index 8055d43a6..b7171c6f6 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -156,7 +156,7 @@ def on_solution_callback(self): current_time = time.time() obj = int(self.ObjectiveValue()) - _self.print_comment('Solution %i, time = %0.2fs' % + _self.print_comment('Solution %i, time = %0.4fs' % (self.__solution_count, current_time - self.__start_time)) _self.print_intermediate(obj) self.__solution_count += 1 @@ -286,7 +286,7 @@ def callback(self, *args, **kwargs): if model.cbGet(GRB.Callback.MIP_SOLCNT) > self.__solution_count: # do we have a new solution? 
obj = int(model.cbGet(GRB.Callback.MIP_OBJBST)) - _self.print_comment('Solution %i, time = %0.2fs' % + _self.print_comment('Solution %i, time = %0.4fs' % (self.__solution_count, current_time - self.__start_time)) _self.print_intermediate(obj) self.__solution_count = model.cbGet(GRB.Callback.MIP_SOLCNT) @@ -324,7 +324,7 @@ def result_found(self, solver, sres): current_time = time.time() obj = sres.get_objective_value() if obj is not None: - _self.print_comment('Solution %i, time = %0.2fs' % + _self.print_comment('Solution %i, time = %0.4fs' % (self.__solution_count, current_time - self.__start_time)) _self.print_intermediate(obj) self.__solution_count += 1 @@ -472,7 +472,7 @@ def run( time_parse = time.time() model = self.read_instance(instance, open=open) time_parse = time.time() - time_parse - if verbose: self.print_comment(f"took {time_parse:.4f} seconds to parse model [{instance}]") + if verbose: self.print_comment(f"took {time_parse:.4f} seconds to parse model") if time_limit and time_limit < _wall_time(p): raise TimeoutError("Time's up after parse") From 4664051472c3a9e59bb2c7769592737d614329ff Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 10 Oct 2025 15:26:44 +0200 Subject: [PATCH 030/152] Missing default return argument --- cpmpy/tools/benchmark/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py index b7171c6f6..11d17ed42 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -421,7 +421,7 @@ def solver_arguments( return self.cpo_arguments(model=model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) else: self.print_comment(f"setting parameters of {solver} is not (yet) supported") - return dict() + return dict(), None def run( self, From 582fc963e2a5eb6e5189c32ed23f6584fa08d670 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 17 Oct 2025 09:10:31 +0200 Subject: [PATCH 031/152] Only import "resource" when supported --- 
cpmpy/tools/benchmark/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpmpy/tools/benchmark/__init__.py b/cpmpy/tools/benchmark/__init__.py index 54aa8031f..ce383c1de 100644 --- a/cpmpy/tools/benchmark/__init__.py +++ b/cpmpy/tools/benchmark/__init__.py @@ -1,5 +1,3 @@ - -import resource import sys import time import warnings @@ -22,12 +20,14 @@ def set_memory_limit(mem_limit): soft = max(_mib_as_bytes(mem_limit) - _mib_as_bytes(MEMORY_BUFFER_SOFT), _mib_as_bytes(MEMORY_BUFFER_SOFT)) hard = max(_mib_as_bytes(mem_limit) - _mib_as_bytes(MEMORY_BUFFER_HARD), _mib_as_bytes(MEMORY_BUFFER_HARD)) if sys.platform != "win32": + import resource resource.setrlimit(resource.RLIMIT_AS, (soft, hard)) # limit memory in number of bytes else: warnings.warn("Memory limits using `resource` are not supported on Windows. Skipping hard limit.") def disable_memory_limit(): if sys.platform != "win32": + import resource soft, hard = resource.getrlimit(resource.RLIMIT_AS) # set a very high soft limit resource.setrlimit(resource.RLIMIT_AS, (hard, hard)) @@ -38,6 +38,7 @@ def set_time_limit(time_limit, verbose:bool=False): """ if time_limit is not None: if sys.platform != "win32": + import resource soft = time_limit hard = resource.RLIM_INFINITY resource.setrlimit(resource.RLIMIT_CPU, (soft, hard)) From 2eea41c23d6bd54db0dc1d5aa399a3da8920354d Mon Sep 17 00:00:00 2001 From: OrestisLomis Date: Thu, 23 Oct 2025 17:49:01 +0200 Subject: [PATCH 032/152] remove var x0 which is not used in opb --- cpmpy/tools/opb/parser.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpmpy/tools/opb/parser.py b/cpmpy/tools/opb/parser.py index e300a2752..f63db7c7d 100644 --- a/cpmpy/tools/opb/parser.py +++ b/cpmpy/tools/opb/parser.py @@ -66,10 +66,10 @@ def _parse_term(line, vars): for v in vars_str.split(): if v.startswith("~x"): - idx = int(v[2:]) # remove "~x" + idx = int(v[2:]) - 1 # remove "~x" and opb is 1-based indexing factors.append(~vars[idx]) 
else: - idx = int(v[1:]) # remove "x" + idx = int(v[1:]) - 1 # remove "x" and opb is 1-based indexing factors.append(vars[idx]) term = int(w) * reduce(mul, factors, 1) # create weighted term @@ -162,13 +162,15 @@ def read_opb(opb: Union[str, os.PathLike], open=open) -> cp.Model: header = HEADER_RE.match(_line) if not header: raise ValueError(f"Missing or incorrect header: \n0: {line}1: {_line}2: ...") - nr_vars = int(header.group(2)) + 1 + nr_vars = int(header.group(2)) # Generator without comment lines reader = (l for l in map(str.strip, f) if l and l[0] != '*') # CPMpy objects vars = cp.boolvar(shape=nr_vars, name="x") + if nr_vars == 1: + vars = cp.cpm_array([vars]) # ensure vars is indexable even for single variable case model = cp.Model() # Special case for first line -> might contain objective function From 6111fc43707d6455ce56f0821458638241c3724b Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 24 Oct 2025 15:25:26 +0200 Subject: [PATCH 033/152] rcpsp dataset and benchmark --- cpmpy/tools/benchmark/_base.py | 2 +- cpmpy/tools/benchmark/psplib.py | 213 ++++++++++++++++++++++++++ cpmpy/tools/dataset/problem/psplib.py | 119 ++++++++++++++ cpmpy/tools/rcpsp/__init__.py | 20 +++ cpmpy/tools/rcpsp/parser.py | 171 +++++++++++++++++++++ 5 files changed, 524 insertions(+), 1 deletion(-) create mode 100644 cpmpy/tools/benchmark/psplib.py create mode 100644 cpmpy/tools/dataset/problem/psplib.py create mode 100644 cpmpy/tools/rcpsp/__init__.py create mode 100644 cpmpy/tools/rcpsp/parser.py diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py index 11d17ed42..4e718bc45 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -59,7 +59,7 @@ class Benchmark(ABC): It is designed to be extended or customized for specific benchmarking needs. 
""" - def __init__(self, reader:callable, exit_status:Enum): + def __init__(self, reader:callable, exit_status:Enum=ExitStatus): """ Arguments: reader (callable): A parser from a model format to a CPMPy model. diff --git a/cpmpy/tools/benchmark/psplib.py b/cpmpy/tools/benchmark/psplib.py new file mode 100644 index 000000000..4fab0c99c --- /dev/null +++ b/cpmpy/tools/benchmark/psplib.py @@ -0,0 +1,213 @@ +""" +PSPLIB as a CPMpy benchmark + +This module provides a benchmarking framework for running CPMpy on PSPLIB +instances. + +Command-line Interface +---------------------- +This script can be run directly to benchmark solvers on PSPLIB datasets. + +Usage: + python psplib.py --year 2024 --track exact-weighted --solver ortools + +Arguments: + --variant Problem variant (e.g., rcpsp). + --family Problem family (e.g., j30, j120, ...) + --solver Solver name (e.g., ortools, exact, choco, ...). + --workers Number of parallel workers to use. + --time-limit Time limit in seconds per instance. + --mem-limit Memory limit in MB per instance. + --cores Number of cores to assign to a single instance. + --output-dir Output directory for CSV files. + --verbose Show solver output if set. + --intermediate Report intermediate solutions if supported. + +=============== +List of classes +=============== + +.. autosummary:: + :nosignatures: + + MSEExitStatus + MSEBenchmark + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + solution_mse +""" + +import warnings +import argparse +from enum import Enum +from pathlib import Path +from datetime import datetime + +# CPMpy +from cpmpy.tools.benchmark.runner import benchmark_runner +from cpmpy.tools.benchmark._base import Benchmark, ExitStatus +from cpmpy.tools.rcpsp import read_rcpsp +from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus + + +def solution_psplib(model): + """ + Convert a CPMpy model solution into the solution string format. 
+ + Arguments: + model (cp.solvers.SolverInterface): The solver-specific model for which to print its solution + + Returns: + str: formatted solution string. + """ + variables = {var.name: var.value() for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]} # dirty workaround for all missed aux vars in user vars TODO fix with Ignace + return str(variables) + +class PSPLIBBenchmark(Benchmark): + + """ + PSPLIB as a CPMpy benchmark. + """ + + def __init__(self): + super().__init__(reader=read_rcpsp) # TODO: reader should depend on problem variant + + def print_comment(self, comment:str): + print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) + + def print_status(self, status: ExitStatus) -> None: + print('s' + chr(32) + status.value, end="\n", flush=True) + + def print_value(self, value: str) -> None: + print('v' + chr(32) + value, end="\n", flush=True) + + def print_objective(self, objective: int) -> None: + print('o' + chr(32) + str(objective), end="\n", flush=True) + + def print_intermediate(self, objective:int): + self.print_objective(objective) + + def print_result(self, s): + if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_value(solution_psplib(s)) + self.print_status(ExitStatus.optimal) + elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_value(solution_psplib(s)) + self.print_status(ExitStatus.sat) + elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: + self.print_status(ExitStatus.unsat) + else: + self.print_comment("Solver did not find any solution within the time/memory limit") + self.print_status(ExitStatus.unknown) + + def handle_memory_error(self, mem_limit): + super().handle_memory_error(mem_limit) + self.print_status(ExitStatus.unknown) + + def handle_not_implemented(self, e): + super().handle_not_implemented(e) + self.print_status(ExitStatus.unsupported) + + def handle_exception(self, e): + super().handle_exception(e) + self.print_status(ExitStatus.unknown) + + + def handle_sigterm(self): + """ + 
Handles a SIGTERM. Gives us 1 second to finish the current job before we get killed. + """ + # Report that we haven't found a solution in time + self.print_status(ExitStatus.unknown) + self.print_comment("SIGTERM raised.") + return 0 + + def handle_rlimit_cpu(self): + """ + Handles a SIGXCPU. + """ + # Report that we haven't found a solution in time + self.print_status(ExitStatus.unknown) + self.print_comment("SIGXCPU raised.") + return 0 + + def parse_output_line(self, line, result): + if line.startswith('s '): + result['status'] = line[2:].strip() + elif line.startswith('v '): + # only record first line, contains 'type' and 'cost' + solution = line.split("\n")[0][2:].strip() + if solution not in result: + result['solution'] = solution + else: + result['solution'] = result['solution'] + ' ' + str(solution) + elif line.startswith('c Solution'): + parts = line.split(', time = ') + # Get solution time from comment for intermediate solution -> used for annotating 'o ...' lines + self.sol_time = float(parts[-1].replace('s', '').rstrip()) + elif line.startswith('o '): + obj = int(line[2:].strip()) + if result['intermediate'] is None: + result['intermediate'] = [] + result['intermediate'] += [(self.sol_time, obj)] + result['objective_value'] = obj + obj = None + elif line.startswith('c took '): + # Parse timing information + parts = line.split(' seconds to ') + if len(parts) == 2: + time_val = float(parts[0].replace('c took ', '')) + action = parts[1].strip() + if action.startswith('parse'): + result['time_parse'] = time_val + elif action.startswith('convert'): + result['time_model'] = time_val + elif action.startswith('post'): + result['time_post'] = time_val + elif action.startswith('solve'): + result['time_solve'] = time_val + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Benchmark solvers on PSPLIB instances') + parser.add_argument('--variant', type=str, required=True, help='Problem variant (e.g., rcpsp)') + 
parser.add_argument('--family', type=str, required=True, help='Problem family (e.g., j30, j120, ...)') + parser.add_argument('--solver', type=str, required=True, help='Solver name (e.g., ortools, exact, choco, ...)') + parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers') + parser.add_argument('--time-limit', type=int, default=300, help='Time limit in seconds per instance') + parser.add_argument('--mem-limit', type=int, default=8192, help='Memory limit in MB per instance') + parser.add_argument('--cores', type=int, default=1, help='Number of cores to assign tp a single instance') + parser.add_argument('--output-dir', type=str, default='results', help='Output directory for CSV files') + parser.add_argument('--verbose', action='store_true', help='Show solver output') + parser.add_argument('--intermediate', action='store_true', help='Report on intermediate solutions') + # parser.add_argument('--checker-path', type=str, default=None, + # help='Path to the XCSP3 solution checker JAR file') + args = parser.parse_args() + + if not args.verbose: + warnings.filterwarnings("ignore") + + # Load benchmark instances (as a dataset) + from cpmpy.tools.dataset.problem.psplib import PSPLibDataset + dataset = PSPLibDataset(variant=args.variant, family=args.family, download=True) + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Get current timestamp in a filename-safe format + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Define output file path with timestamp + output_file = str(output_dir / "psplib" / f"psplib_{args.variant}_{args.family}_{args.solver}_{timestamp}.csv") + + # Run the benchmark + instance_runner = PSPLIBBenchmark() + output_file = benchmark_runner(dataset=dataset, instance_runner=instance_runner, output_file=output_file, **vars(args)) + print(f"Results added to {output_file}") diff --git a/cpmpy/tools/dataset/problem/psplib.py 
b/cpmpy/tools/dataset/problem/psplib.py new file mode 100644 index 000000000..b1cbf70f6 --- /dev/null +++ b/cpmpy/tools/dataset/problem/psplib.py @@ -0,0 +1,119 @@ +import os +import pathlib +from typing import Tuple, Any +from urllib.request import urlretrieve +from urllib.error import HTTPError, URLError +import zipfile + +class PSPLibDataset(object): # torch.utils.data.Dataset compatible + + """ + PSPlib Dataset in a PyTorch compatible format. + + Arguments: + root (str): Root directory containing the psplib instances (if 'download', instances will be downloaded to this location) + variant (str): scheduling variant (only 'rcpsp' is supported for now) + family (str): family name (e.g. j30, j60, etc...) + transform (callable, optional): Optional transform to be applied on the instance data + target_transform (callable, optional): Optional transform to be applied on the file path + download (bool): If True, downloads the dataset from the internet and puts it in `root` directory + """ + + def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False): + """ + Initialize the PSPLib Dataset. + """ + + self.root = pathlib.Path(root) + self.variant = variant + self.family = family + self.transform = transform + self.target_transform = target_transform + self.family_dir = pathlib.Path(os.path.join(self.root, variant, family)) + + self.families = dict( + rcpsp = ["j30", "j60", "j90", "j120"] + ) + self.family_codes = dict(rcpsp="sm", mrcpsp="mm") + + if variant != "rcpsp": + raise ValueError("Only 'rcpsp' variant is supported for now") + if family not in self.families[variant]: + raise ValueError(f"Unknown problem family. 
Must be any of {','.join(self.families[variant])}") + # Create root directory if it doesn't exist + self.root.mkdir(parents=True, exist_ok=True) + + if not self.family_dir.exists(): + if not download: + raise ValueError(f"Dataset for variant {variant} and family {family} not found. Please set download=True to download the dataset.") + else: + print(f"Downloading PSPLib {variant} {family} instances...") + + zip_name = f"{family}.{self.family_codes[variant]}.zip" + url = f"https://www.om-db.wi.tum.de/psplib/files/" + + url_path = url + zip_name + zip_path = self.root / zip_name + + try: + urlretrieve(url_path, str(zip_path)) + except (HTTPError, URLError) as e: + raise ValueError(f"No dataset available for variant {variant} and family {family}. Error: {str(e)}") + + # make directory and extract files + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + # Create track folder in root directory, parents=True ensures recursive creation + self.family_dir.mkdir(parents=True, exist_ok=True) + + # Extract files + for file_info in zip_ref.infolist(): + # Extract file to family_dir, removing main_folder/track prefix + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.family_dir / filename, 'wb') as target: + target.write(source.read()) + # Clean up the zip file + zip_path.unlink() + + def open(self, instance: os.PathLike) -> callable: + return open(instance, "r") + + + def __len__(self) -> int: + """Return the total number of instances.""" + return len(list(self.family_dir.glob(f"*.{self.family_codes[self.variant]}"))) + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Get a single RCPSP instance filename and metadata. + + Args: + index (int): Index of the instance to retrieve + + Returns: + Tuple[Any, Any]: A tuple containing: + - The filename of the instance + - Metadata dictionary with file name, track, year etc. 
+ """ + if index < 0 or index >= len(self): + raise IndexError("Index out of range") + + # Get all instance files and sort for deterministic behavior # TODO: use natsort instead? + files = sorted(list(self.family_dir.glob(f"*.{self.family_codes[self.variant]}"))) + file_path = files[index] + + filename = str(file_path) + if self.transform: + # does not need to remain a filename... + filename = self.transform(filename) + + # Basic metadata about the instance + metadata = dict( + variant = self.variant, + family = self.family, + name = file_path.stem + ) + + if self.target_transform: + metadata = self.target_transform(metadata) + + return filename, metadata \ No newline at end of file diff --git a/cpmpy/tools/rcpsp/__init__.py b/cpmpy/tools/rcpsp/__init__.py new file mode 100644 index 000000000..b24d99980 --- /dev/null +++ b/cpmpy/tools/rcpsp/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## __init__.py +## +""" +Set of utilities for working with psplib-formatted rcpsp CP models. + + +================== +List of submodules +================== + +.. autosummary:: + :nosignatures: + + parser +""" + +from .parser import read_rcpsp diff --git a/cpmpy/tools/rcpsp/parser.py b/cpmpy/tools/rcpsp/parser.py new file mode 100644 index 000000000..cadc32482 --- /dev/null +++ b/cpmpy/tools/rcpsp/parser.py @@ -0,0 +1,171 @@ +""" +Parser for the PSPLIB RCPSP format. + + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read_rcpsp +""" + + +import os +import sys +import lzma +import argparse +import cpmpy as cp +from io import StringIO +from typing import Union + + +_std_open = open +def read_rcpsp(rcpsp: Union[str, os.PathLike], open=open) -> cp.Model: + """ + Parser for PSPLIB RCPSP format. Reads in an instance and returns its matching CPMpy model. 
+ + Arguments: + rcpsp (str or os.PathLike): + - A file path to a PSPLIB RCPSP file + - OR a string containing the RCPSP content directly + open: (callable): + If rcpsp is the path to a file, a callable to "open" that file (default=python standard library's 'open'). + + Returns: + cp.Model: The CPMpy model of the PSPLIB RCPSP instance. + """ + # If rcpsp is a path to a file -> open file + if isinstance(rcpsp, (str, os.PathLike)) and os.path.exists(rcpsp): + if open is not None: + f = open(rcpsp) + else: + f = _std_open(rcpsp, "rt") + # If rcpsp is a string containing a model -> create a memory-mapped file + else: + f = StringIO(rcpsp) + + + table, capacities = _parse_rcpsp(f) + model, (start, end, makespan) = _model_rcpsp(job_data=table, capacities=capacities) + return model + +def _parse_rcpsp(f): + + data = dict() + + line = f.readline() + while not line.startswith("PRECEDENCE RELATIONS:"): + line = f.readline() + + f.readline() # skip keyword line + line = f.readline() # first line of table, skip + while not line.startswith("*****"): + jobnr, n_modes, n_succ, *succ = [int(x) for x in line.split(" ") if len(x.strip())] + assert len(succ) == n_succ, "Expected %d successors for job %d, got %d" % (n_succ, jobnr, len(succ)) + data[jobnr] = dict(num_modes=n_modes, successors=succ) + line = f.readline() + + # skip to job info + while not line.startswith("REQUESTS/DURATIONS:"): + line = f.readline() + + line = f.readline() + _j, _m, _d, *_r = [x.strip() for x in line.split(" ") if len(x.strip())] # first line of table + resource_names = [f"{_r[i]}{_r[i+1]}" for i in range(0,len(_r),2)] + line = f.readline() # first line of table + if line.startswith("----") or line.startswith("*****"): # intermediate line in table... 
+ line = f.readline() # skip + + while not line.startswith("*****"): + jobnr, mode, duration, *resources = [int(x) for x in line.split(" ") if len(x.strip())] + assert len(resources) == len(resource_names), "Expected %d resources for job %d, got %d" % (len(resource_names), jobnr, len(resources)) + data[jobnr].update(dict(mode=mode, duration=duration)) + data[jobnr].update({name : req for name, req in zip(resource_names, resources)}) + line = f.readline() + + # read resource availabilities + while not line.startswith("RESOURCEAVAILABILITIES:"): + line = f.readline() + + f.readline() # skip header + capacities = [int(x) for x in f.readline().split(" ") if len(x)] + + import pandas as pd + df =pd.DataFrame([dict(jobnr=k ,**info) for k, info in data.items()], + columns=["jobnr", "mode", "duration", "successors", *resource_names]) + df.set_index("jobnr", inplace=True) + + return df, dict(zip(resource_names, capacities)) + +def _model_rcpsp(job_data, capacities): + + model = cp.Model() + + horizon = job_data.duration.sum() # worst case, all jobs sequential on a machine + makespan = cp.intvar(0, horizon, name="makespan") + + start = cp.intvar(0, horizon, name="start", shape=len(job_data)) + end = cp.intvar(0, horizon, name="end", shape=len(job_data)) + + # ensure capacity is not exceeded + for rescource, capa in capacities.items(): + model += cp.Cumulative( + start = start, + duration = job_data['duration'].tolist(), + end = end, + demand = job_data[rescource].tolist(), + capacity = capa + ) + + # enforce precedences + for idx, (jobnr, info) in enumerate(job_data.iterrows()): + for succ in info['successors']: + model += end[idx] <= start[succ-1] # job ids start at idx 1 + + model += end <= makespan + model.minimize(makespan) + + return model, (start, end, makespan) + + +def main(): + parser = argparse.ArgumentParser(description="Parse and solve a PSPLIB RCPSP model using CPMpy") + parser.add_argument("model", help="Path to a PSPLIB RCPSP file (or raw RCPSP string if 
--string is given)") + parser.add_argument("-s", "--solver", default=None, help="Solver name to use (default: CPMpy's default)") + parser.add_argument("--string", action="store_true", help="Interpret the first argument (model) as a raw RCPSP string instead of a file path") + parser.add_argument("-t", "--time-limit", type=int, default=None, help="Time limit for the solver in seconds (default: no limit)") + args = parser.parse_args() + + # Build the CPMpy model + try: + if args.string: + model = read_rcpsp(args.model) + else: + model = read_rcpsp(os.path.expanduser(args.model)) + except Exception as e: + sys.stderr.write(f"Error reading model: {e}\n") + sys.exit(1) + + # Solve the model + try: + if args.solver: + result = model.solve(solver=args.solver, time_limit=args.time_limit) + else: + result = model.solve(time_limit=args.time_limit) + except Exception as e: + sys.stderr.write(f"Error solving model: {e}\n") + sys.exit(1) + + # Print results + print("Status:", model.status()) + if result is not None: + if model.has_objective(): + print("Objective:", model.objective_value()) + else: + print("No solution found.") + +if __name__ == "__main__": + main() \ No newline at end of file From af36c877c9d61eae1b595aa1650a4d42caebf9a9 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 24 Oct 2025 15:26:11 +0200 Subject: [PATCH 034/152] opb fix intermediate solutions --- cpmpy/tools/benchmark/opb.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpmpy/tools/benchmark/opb.py b/cpmpy/tools/benchmark/opb.py index b92fcb257..5c0c222cd 100644 --- a/cpmpy/tools/benchmark/opb.py +++ b/cpmpy/tools/benchmark/opb.py @@ -154,17 +154,17 @@ def parse_output_line(self, line, result): result['solution'] = solution else: result['solution'] = result['solution'] + ' ' + str(solution) + elif line.startswith('c Solution'): + parts = line.split(', time = ') + # Get solution time from comment for intermediate solution -> used for annotating 'o ...' 
lines + self.sol_time = float(parts[-1].replace('s', '').rstrip()) elif line.startswith('o '): obj = int(line[2:].strip()) if result['intermediate'] is None: result['intermediate'] = [] - result['intermediate'] += [(sol_time, obj)] + result['intermediate'] += [(self.sol_time, obj)] result['objective_value'] = obj obj = None - elif line.startswith('c Solution'): - parts = line.split(', time = ') - # Get solution time from comment for intermediate solution -> used for annotating 'o ...' lines - sol_time = float(parts[-1].replace('s', '').rstrip()) elif line.startswith('c took '): # Parse timing information parts = line.split(' seconds to ') From a834387f7900d7d0289267d57800e2f63bb3824c Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 24 Oct 2025 15:33:43 +0200 Subject: [PATCH 035/152] update docstrings --- cpmpy/tools/dataset/problem/psplib.py | 35 ++++++++++++++++++++------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/cpmpy/tools/dataset/problem/psplib.py b/cpmpy/tools/dataset/problem/psplib.py index b1cbf70f6..89f0e93c7 100644 --- a/cpmpy/tools/dataset/problem/psplib.py +++ b/cpmpy/tools/dataset/problem/psplib.py @@ -1,3 +1,8 @@ +""" +PSPlib Dataset + +https://www.om-db.wi.tum.de/psplib/getdata_sm.html +""" import os import pathlib from typing import Tuple, Any @@ -10,18 +15,25 @@ class PSPLibDataset(object): # torch.utils.data.Dataset compatible """ PSPlib Dataset in a PyTorch compatible format. - Arguments: - root (str): Root directory containing the psplib instances (if 'download', instances will be downloaded to this location) - variant (str): scheduling variant (only 'rcpsp' is supported for now) - family (str): family name (e.g. j30, j60, etc...) 
- transform (callable, optional): Optional transform to be applied on the instance data - target_transform (callable, optional): Optional transform to be applied on the file path - download (bool): If True, downloads the dataset from the internet and puts it in `root` directory + More information on PSPlib can be found here: https://www.om-db.wi.tum.de/psplib/main.html """ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False): """ - Initialize the PSPLib Dataset. + Constructor for a dataset object for PSPlib. + + Arguments: + root (str): Root directory containing the psplib instances (if 'download', instances will be downloaded to this location) + variant (str): scheduling variant (only 'rcpsp' is supported for now) + family (str): family name (e.g. j30, j60, etc...) + transform (callable, optional): Optional transform to be applied on the instance data + target_transform (callable, optional): Optional transform to be applied on the file path + download (bool): If True, downloads the dataset from the internet and puts it in `root` directory + + + Raises: + ValueError: If the dataset directory does not exist and `download=False`, + or if the requested variant/family combination is not available. 
""" self.root = pathlib.Path(root) @@ -116,4 +128,9 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: if self.target_transform: metadata = self.target_transform(metadata) - return filename, metadata \ No newline at end of file + return filename, metadata + +if __name__ == "__main__": + dataset = PSPLibDataset(variant="rcpsp", family="j30", download=True) + print("Dataset size:", len(dataset)) + print("Instance 0:", dataset[0]) \ No newline at end of file From 8805cad7fab38bc74d0d7b05698ab406469e8706 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 24 Oct 2025 15:49:29 +0200 Subject: [PATCH 036/152] Fix more docstring --- cpmpy/tools/benchmark/psplib.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpmpy/tools/benchmark/psplib.py b/cpmpy/tools/benchmark/psplib.py index 4fab0c99c..26046cf84 100644 --- a/cpmpy/tools/benchmark/psplib.py +++ b/cpmpy/tools/benchmark/psplib.py @@ -9,7 +9,7 @@ This script can be run directly to benchmark solvers on PSPLIB datasets. Usage: - python psplib.py --year 2024 --track exact-weighted --solver ortools + python psplib.py --year 2024 --variant rcpsp --family j30 Arguments: --variant Problem variant (e.g., rcpsp). @@ -30,8 +30,7 @@ .. autosummary:: :nosignatures: - MSEExitStatus - MSEBenchmark + PSPLIBBenchmark ================= List of functions @@ -40,7 +39,7 @@ .. 
autosummary:: :nosignatures: - solution_mse + solution_psplib """ import warnings From ce6b6bcc51604e24014c3ec4a9e482e46606463e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 24 Oct 2025 15:50:01 +0200 Subject: [PATCH 037/152] Add JSPLib dataset and benchmark --- cpmpy/tools/benchmark/jsplib.py | 209 ++++++++++++++++++++++++ cpmpy/tools/dataset/problem/jsplib.py | 218 ++++++++++++++++++++++++++ cpmpy/tools/jsplib/__init__.py | 20 +++ cpmpy/tools/jsplib/parser.py | 148 +++++++++++++++++ 4 files changed, 595 insertions(+) create mode 100644 cpmpy/tools/benchmark/jsplib.py create mode 100644 cpmpy/tools/dataset/problem/jsplib.py create mode 100644 cpmpy/tools/jsplib/__init__.py create mode 100644 cpmpy/tools/jsplib/parser.py diff --git a/cpmpy/tools/benchmark/jsplib.py b/cpmpy/tools/benchmark/jsplib.py new file mode 100644 index 000000000..30c99da79 --- /dev/null +++ b/cpmpy/tools/benchmark/jsplib.py @@ -0,0 +1,209 @@ +""" +JSPLib as a CPMpy benchmark + +This module provides a benchmarking framework for running CPMpy on JSPLib +instances. + +Command-line Interface +---------------------- +This script can be run directly to benchmark solvers on JSPLib datasets. + +Usage: + python jsplib.py --solver ortools + +Arguments: + --solver Solver name (e.g., ortools, exact, choco, ...). + --workers Number of parallel workers to use. + --time-limit Time limit in seconds per instance. + --mem-limit Memory limit in MB per instance. + --cores Number of cores to assign to a single instance. + --output-dir Output directory for CSV files. + --verbose Show solver output if set. + --intermediate Report intermediate solutions if supported. + +=============== +List of classes +=============== + +.. autosummary:: + :nosignatures: + + MSEExitStatus + MSEBenchmark + +================= +List of functions +================= + +.. 
autosummary:: + :nosignatures: + + solution_mse +""" + +import warnings +import argparse +from enum import Enum +from pathlib import Path +from datetime import datetime + +# CPMpy +from cpmpy.tools.benchmark.runner import benchmark_runner +from cpmpy.tools.benchmark._base import Benchmark, ExitStatus +from cpmpy.tools.jsplib import read_jsplib +from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus + + +def solution_psplib(model): + """ + Convert a CPMpy model solution into the solution string format. + + Arguments: + model (cp.solvers.SolverInterface): The solver-specific model for which to print its solution + + Returns: + str: formatted solution string. + """ + variables = {var.name: var.value() for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]} # dirty workaround for all missed aux vars in user vars TODO fix with Ignace + return str(variables) + +class JSPLibBenchmark(Benchmark): + + """ + PSPLIB as a CPMpy benchmark. + """ + + def __init__(self): + super().__init__(reader=read_jsplib) + + def print_comment(self, comment:str): + print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) + + def print_status(self, status: ExitStatus) -> None: + print('s' + chr(32) + status.value, end="\n", flush=True) + + def print_value(self, value: str) -> None: + print('v' + chr(32) + value, end="\n", flush=True) + + def print_objective(self, objective: int) -> None: + print('o' + chr(32) + str(objective), end="\n", flush=True) + + def print_intermediate(self, objective:int): + self.print_objective(objective) + + def print_result(self, s): + if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_value(solution_psplib(s)) + self.print_status(ExitStatus.optimal) + elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_value(solution_psplib(s)) + self.print_status(ExitStatus.sat) + elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: + self.print_status(ExitStatus.unsat) + else: + self.print_comment("Solver did not find any 
solution within the time/memory limit") + self.print_status(ExitStatus.unknown) + + def handle_memory_error(self, mem_limit): + super().handle_memory_error(mem_limit) + self.print_status(ExitStatus.unknown) + + def handle_not_implemented(self, e): + super().handle_not_implemented(e) + self.print_status(ExitStatus.unsupported) + + def handle_exception(self, e): + super().handle_exception(e) + self.print_status(ExitStatus.unknown) + + + def handle_sigterm(self): + """ + Handles a SIGTERM. Gives us 1 second to finish the current job before we get killed. + """ + # Report that we haven't found a solution in time + self.print_status(ExitStatus.unknown) + self.print_comment("SIGTERM raised.") + return 0 + + def handle_rlimit_cpu(self): + """ + Handles a SIGXCPU. + """ + # Report that we haven't found a solution in time + self.print_status(ExitStatus.unknown) + self.print_comment("SIGXCPU raised.") + return 0 + + def parse_output_line(self, line, result): + if line.startswith('s '): + result['status'] = line[2:].strip() + elif line.startswith('v '): + # only record first line, contains 'type' and 'cost' + solution = line.split("\n")[0][2:].strip() + if solution not in result: + result['solution'] = solution + else: + result['solution'] = result['solution'] + ' ' + str(solution) + elif line.startswith('c Solution'): + parts = line.split(', time = ') + # Get solution time from comment for intermediate solution -> used for annotating 'o ...' 
lines + self.sol_time = float(parts[-1].replace('s', '').rstrip()) + elif line.startswith('o '): + obj = int(line[2:].strip()) + if result['intermediate'] is None: + result['intermediate'] = [] + result['intermediate'] += [(self.sol_time, obj)] + result['objective_value'] = obj + obj = None + elif line.startswith('c took '): + # Parse timing information + parts = line.split(' seconds to ') + if len(parts) == 2: + time_val = float(parts[0].replace('c took ', '')) + action = parts[1].strip() + if action.startswith('parse'): + result['time_parse'] = time_val + elif action.startswith('convert'): + result['time_model'] = time_val + elif action.startswith('post'): + result['time_post'] = time_val + elif action.startswith('solve'): + result['time_solve'] = time_val + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description='Benchmark solvers on JSPLib instances') + parser.add_argument('--solver', type=str, required=True, help='Solver name (e.g., ortools, exact, choco, ...)') + parser.add_argument('--workers', type=int, default=4, help='Number of parallel workers') + parser.add_argument('--time-limit', type=int, default=300, help='Time limit in seconds per instance') + parser.add_argument('--mem-limit', type=int, default=8192, help='Memory limit in MB per instance') + parser.add_argument('--cores', type=int, default=1, help='Number of cores to assign tp a single instance') + parser.add_argument('--output-dir', type=str, default='results', help='Output directory for CSV files') + parser.add_argument('--verbose', action='store_true', help='Show solver output') + parser.add_argument('--intermediate', action='store_true', help='Report on intermediate solutions') + # parser.add_argument('--checker-path', type=str, default=None, + # help='Path to the XCSP3 solution checker JAR file') + args = parser.parse_args() + + if not args.verbose: + warnings.filterwarnings("ignore") + + # Load benchmark instances (as a dataset) + from cpmpy.tools.dataset.problem.jsplib 
import JSPLibDataset + dataset = JSPLibDataset(download=True) + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Get current timestamp in a filename-safe format + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Define output file path with timestamp + output_file = str(output_dir / "jsplib" / f"psplib_{args.solver}_{timestamp}.csv") + + # Run the benchmark + instance_runner = JSPLibBenchmark() + output_file = benchmark_runner(dataset=dataset, instance_runner=instance_runner, output_file=output_file, **vars(args)) + print(f"Results added to {output_file}") diff --git a/cpmpy/tools/dataset/problem/jsplib.py b/cpmpy/tools/dataset/problem/jsplib.py new file mode 100644 index 000000000..54cba2890 --- /dev/null +++ b/cpmpy/tools/dataset/problem/jsplib.py @@ -0,0 +1,218 @@ +""" +PyTorch-style Dataset for Jobshop instances from JSPLib + +Simply create a dataset instance and start iterating over its contents: +The `metadata` contains usefull information about the current problem instance. + +https://github.com/tamy0612/JSPLIB +""" +import os +import json +import pathlib +from os.path import join +from typing import Tuple, Any +from urllib.request import urlretrieve +from urllib.error import HTTPError, URLError +import zipfile +import numpy as np + +from matplotlib import pyplot as plt + +import cpmpy as cp + +class JSPLibDataset(object): # torch.utils.data.Dataset compatible + + """ + JSP Dataset in a PyTorch compatible format. + + More information on JSPLib can be found here: https://github.com/tamy0612/JSPLIB + """ + + def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False): + """ + Initialize the PSPLib Dataset. 
+ + Arguments: + root (str): Root directory containing the jsp instances (if 'download', instances will be downloaded to this location) + transform (callable, optional): Optional transform to be applied on the instance data + target_transform (callable, optional): Optional transform to be applied on the file path + download (bool): If True, downloads the dataset from the internet and puts it in `root` directory + """ + + self.root = pathlib.Path(root) + self.instance_dir = pathlib.Path(join(self.root, "jsplib")) + self.metadata_file = "instances.json" + self.transform = transform + self.target_transform = target_transform + + # Create root directory if it doesn't exist + self.root.mkdir(parents=True, exist_ok=True) + + print(self.instance_dir, self.instance_dir.exists(), self.instance_dir.is_dir()) + if not self.instance_dir.exists(): + if not download: + raise ValueError(f"Dataset not found in local file system. Please set download=True to download the dataset.") + else: + url = f"https://github.com/tamy0612/JSPLIB/archive/refs/heads/master.zip" # download full repo... + url_path = url + zip_path = pathlib.Path(join(root,"jsplib-master.zip")) + + print(f"Downloading JSPLib instances..") + + try: + urlretrieve(url_path, str(zip_path)) + except (HTTPError, URLError) as e: + raise ValueError(f"No dataset available on {url}. 
Error: {str(e)}") + + # make directory and extract files + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + self.instance_dir.mkdir(parents=True, exist_ok=True) + + # Extract files + for file_info in zip_ref.infolist(): + if file_info.filename.startswith("JSPLIB-master/instances/") and file_info.file_size > 0: + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.instance_dir / filename, 'wb') as target: + target.write(source.read()) + # extract metadata file + with zip_ref.open("JSPLIB-master/instances.json") as source, open(self.instance_dir / self.metadata_file, 'wb') as target: + target.write(source.read()) + # Clean up the zip file + zip_path.unlink() + + + def __len__(self) -> int: + """Return the total number of instances.""" + return len(list(self.instance_dir.glob("*"))) + + def __getitem__(self, index: int|str) -> Tuple[Any, Any]: + """ + Get a single JSPLib instance filename and metadata. + + Args: + index (int or str): Index or name of the instance to retrieve + + Returns: + Tuple[Any, Any]: A tuple containing: + - The filename of the instance + - Metadata dictionary with file name, track, year etc. + """ + if isinstance(index, int) and (index < 0 or index >= len(self)): + raise IndexError("Index out of range") + + # Get all instance files and sort for deterministic behavior # TODO: use natsort instead? + files = sorted(list(self.instance_dir.glob("*[!.json]"))) # exclude metadata file + if isinstance(index, int): + file_path = files[index] + elif isinstance(index, str): + for file_path in files: + if file_path.stem == index: + break + else: + raise IndexError(f"Instance {index} not found in dataset") + + filename = str(file_path) + if self.transform: + # does not need to remain a filename... 
+ filename = self.transform(filename) + + with open(self.instance_dir / self.metadata_file, "r") as f: + for entry in json.load(f): + if entry["name"] == file_path.stem: + metadata = entry + metadata['path'] = str(file_path) + break + else: + metadata = dict() + + if self.target_transform: + metadata = self.target_transform(metadata) + + return filename, metadata + + def open(self, instance: os.PathLike) -> callable: + return open(instance, "r") + + +def parse_jsp(filename: str): + """ + Parse a JSPLib instance file + Returns two matrices: + - task to machines indicating on which machine to run which task + - task durations: indicating the duration of each task + """ + + with open(filename, "r") as f: + line = f.readline() + while line.startswith("#"): + line = f.readline() + n_jobs, n_tasks = map(int, line.strip().split(" ")) + matrix = np.fromstring(f.read(), sep=" ", dtype=int).reshape((n_jobs, n_tasks*2)) + + task_to_machines = np.empty(dtype=int, shape=(n_jobs, n_tasks)) + task_durations = np.empty(dtype=int, shape=(n_jobs, n_tasks)) + + for t in range(n_tasks): + task_to_machines[:, t] = matrix[:, t*2] + task_durations[:, t] = matrix[:, t*2+1] + + return task_to_machines, task_durations + +def jobshop_model(task_to_machines, task_durations): + + + task_to_machines = np.array(task_to_machines) + dur = np.array(task_durations) + + assert task_to_machines.shape == task_durations.shape + + n_jobs, n_tasks = task_to_machines.shape + + start = cp.intvar(0, task_durations.sum(), name="start", shape=(n_jobs,n_tasks)) # extremely bad upperbound... TODO + end = cp.intvar(0, task_durations.sum(), name="end", shape=(n_jobs,n_tasks)) # extremely bad upperbound... TODO + makespan = cp.intvar(0, task_durations.sum(), name="makespan") # extremely bad upperbound... 
TODO + + model = cp.Model() + model += start + dur == end + model += end[:,:-1] <= start[:,1:] # precedences + + for machine in set(task_to_machines.flat): + model += cp.NoOverlap(start[task_to_machines == machine], + dur[task_to_machines == machine], + end[task_to_machines == machine]) + + model += end <= makespan + model.minimize(makespan) + + return model, (start, makespan) + + +if __name__ == "__main__": + + dataset = JSPLibDataset(root=".", download=True, transform=parse_jsp) + print("Dataset size:", len(dataset)) + print("Instance 0:") + (machines, dur), metadata = dataset[0] + print("Machines:", machines) + print("Durations:", dur) + print("Metadata:", metadata) + + print("Solving", metadata['name']) + model, (start, makespan) = jobshop_model(task_to_machines=machines, task_durations=dur) + assert model.solve(time_limit=10) + + import pandas as pd + import plotly.express as px + import plotly.io as pio + pio.renderers.default = "browser" # ensure plotly opens figure in browser + + df = pd.DataFrame({"Start": start.value().flat, "Duration": dur.flat, "Machine": machines.flat}) + df["Job"] = [j for j in range(metadata['jobs']) for _ in range(metadata['machines']) ] + df["Task"] = [j for _ in range(metadata['machines']) for j in range(metadata['jobs'])] + df["Name"] = "T" + df["Job"].astype(str) + "-" + df["Task"].astype(str) + print(df) + ghant_fig = px.bar(df, orientation='h', + base="Start", x="Duration", y="Machine", color="Job", text="Name", + title=f"Jobshop instance {metadata['name']}, makespan: {makespan.value()}, status: {model.status()}" + ) + ghant_fig.show() \ No newline at end of file diff --git a/cpmpy/tools/jsplib/__init__.py b/cpmpy/tools/jsplib/__init__.py new file mode 100644 index 000000000..6ebdec377 --- /dev/null +++ b/cpmpy/tools/jsplib/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## __init__.py +## +""" +Set of utilities for working with JSPLib-formatted CP models. 
+ + +================== +List of submodules +================== + +.. autosummary:: + :nosignatures: + + parser +""" + +from .parser import read_jsplib diff --git a/cpmpy/tools/jsplib/parser.py b/cpmpy/tools/jsplib/parser.py new file mode 100644 index 000000000..11c820faa --- /dev/null +++ b/cpmpy/tools/jsplib/parser.py @@ -0,0 +1,148 @@ +""" +Parser for the JSPLib format. + + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read_jsplib +""" + + +import os +import sys +import lzma +import argparse +import cpmpy as cp +import numpy as np +from io import StringIO +from typing import Union + + +_std_open = open +def read_jsplib(jsp: Union[str, os.PathLike], open=open) -> cp.Model: + """ + Parser for JSPLib format. Reads in an instance and returns its matching CPMpy model. + + Arguments: + jsp (str or os.PathLike): + - A file path to a JSPlib file + - OR a string containing the JSPLib content directly + open: (callable): + If jsp is the path to a file, a callable to "open" that file (default=python standard library's 'open'). + + Returns: + cp.Model: The CPMpy model of the JSPLib instance. 
+ """ + # If rcpsp is a path to a file -> open file + if isinstance(jsp, (str, os.PathLike)) and os.path.exists(jsp): + if open is not None: + f = open(jsp) + else: + f = _std_open(jsp, "rt") + # If rcpsp is a string containing a model -> create a memory-mapped file + else: + f = StringIO(jsp) + + + task_to_machines, task_durations = _parse_jsplib(f) + model, (start, makespan) = _model_jsplib(task_to_machines=task_to_machines, task_durations=task_durations) + return model + + +def _parse_jsplib(f): + """ + Parse a JSPLib instance file + Returns two matrices: + - task to machines indicating on which machine to run which task + - task durations: indicating the duration of each task + """ + + line = f.readline() + while line.startswith("#"): + line = f.readline() + n_jobs, n_tasks = map(int, line.strip().split(" ")) + matrix = np.fromstring(f.read(), sep=" ", dtype=int).reshape((n_jobs, n_tasks*2)) + + task_to_machines = np.empty(dtype=int, shape=(n_jobs, n_tasks)) + task_durations = np.empty(dtype=int, shape=(n_jobs, n_tasks)) + + for t in range(n_tasks): + task_to_machines[:, t] = matrix[:, t*2] + task_durations[:, t] = matrix[:, t*2+1] + + return task_to_machines, task_durations + + + +def _model_jsplib(task_to_machines, task_durations): + + task_to_machines = np.array(task_to_machines) + dur = np.array(task_durations) + + assert task_to_machines.shape == task_durations.shape + + n_jobs, n_tasks = task_to_machines.shape + + start = cp.intvar(0, task_durations.sum(), name="start", shape=(n_jobs,n_tasks)) # extremely bad upperbound... TODO + end = cp.intvar(0, task_durations.sum(), name="end", shape=(n_jobs,n_tasks)) # extremely bad upperbound... TODO + makespan = cp.intvar(0, task_durations.sum(), name="makespan") # extremely bad upperbound... 
TODO + + model = cp.Model() + model += start + dur == end + model += end[:,:-1] <= start[:,1:] # precedences + + for machine in set(task_to_machines.flat): + model += cp.NoOverlap(start[task_to_machines == machine], + dur[task_to_machines == machine], + end[task_to_machines == machine]) + + model += end <= makespan + model.minimize(makespan) + + return model, (start, makespan) + + + +def main(): + parser = argparse.ArgumentParser(description="Parse and solve a JSPLib model using CPMpy") + parser.add_argument("model", help="Path to a JSPLib file (or raw RCPSP string if --string is given)") + parser.add_argument("-s", "--solver", default=None, help="Solver name to use (default: CPMpy's default)") + parser.add_argument("--string", action="store_true", help="Interpret the first argument (model) as a raw JSPLib string instead of a file path") + parser.add_argument("-t", "--time-limit", type=int, default=None, help="Time limit for the solver in seconds (default: no limit)") + args = parser.parse_args() + + # Build the CPMpy model + try: + if args.string: + model = read_jsplib(args.model) + else: + model = read_jsplib(os.path.expanduser(args.model)) + except Exception as e: + sys.stderr.write(f"Error reading model: {e}\n") + sys.exit(1) + + # Solve the model + try: + if args.solver: + result = model.solve(solver=args.solver, time_limit=args.time_limit) + else: + result = model.solve(time_limit=args.time_limit) + except Exception as e: + sys.stderr.write(f"Error solving model: {e}\n") + sys.exit(1) + + # Print results + print("Status:", model.status()) + if result is not None: + if model.has_objective(): + print("Objective:", model.objective_value()) + else: + print("No solution found.") + +if __name__ == "__main__": + main() \ No newline at end of file From 9098299f5ecc2f986e622abb1fb3b3135595b571 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 24 Oct 2025 16:04:42 +0200 Subject: [PATCH 038/152] Add bounds for all jsplib instances --- 
cpmpy/tools/dataset/problem/jsplib.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpmpy/tools/dataset/problem/jsplib.py b/cpmpy/tools/dataset/problem/jsplib.py index 54cba2890..d110c48db 100644 --- a/cpmpy/tools/dataset/problem/jsplib.py +++ b/cpmpy/tools/dataset/problem/jsplib.py @@ -120,6 +120,9 @@ def __getitem__(self, index: int|str) -> Tuple[Any, Any]: for entry in json.load(f): if entry["name"] == file_path.stem: metadata = entry + if "bounds" not in metadata: + metadata["bounds"] = {"upper": metadata["optimum"], "lower": metadata["optimum"]} + del metadata['path'] metadata['path'] = str(file_path) break else: From 658967d3951e2dc0e7219c755a6b7fc994271399 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sat, 25 Oct 2025 10:48:47 +0200 Subject: [PATCH 039/152] Fix choco args --- cpmpy/tools/benchmark/_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/_base.py b/cpmpy/tools/benchmark/_base.py index 11d17ed42..3f9257d49 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -34,6 +34,7 @@ import signal import sys import time +import math import random import psutil import warnings @@ -191,7 +192,7 @@ def exact_arguments( return res, None - def choco_arguments(): + def choco_arguments(self): # Documentation: https://github.com/chocoteam/pychoco/blob/master/pychoco/solver.py return {}, None From 38db2906c84fe70ee0c979b0defedfcae94b4c0e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sat, 25 Oct 2025 18:16:40 +0200 Subject: [PATCH 040/152] Fixes --- cpmpy/tools/benchmark/jsplib.py | 6 +++++- cpmpy/tools/benchmark/psplib.py | 6 +++++- cpmpy/tools/benchmark/xcsp3.py | 10 +++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/cpmpy/tools/benchmark/jsplib.py b/cpmpy/tools/benchmark/jsplib.py index 30c99da79..e9dacb7ce 100644 --- a/cpmpy/tools/benchmark/jsplib.py +++ b/cpmpy/tools/benchmark/jsplib.py @@ -74,6 +74,7 @@ class JSPLibBenchmark(Benchmark): """ def 
__init__(self): + self.sol_time = None super().__init__(reader=read_jsplib) def print_comment(self, comment:str): @@ -93,9 +94,11 @@ def print_intermediate(self, objective:int): def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_objective(s.objective_value()) self.print_value(solution_psplib(s)) self.print_status(ExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_objective(s.objective_value()) self.print_value(solution_psplib(s)) self.print_status(ExitStatus.sat) elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: @@ -153,7 +156,8 @@ def parse_output_line(self, line, result): obj = int(line[2:].strip()) if result['intermediate'] is None: result['intermediate'] = [] - result['intermediate'] += [(self.sol_time, obj)] + if self.sol_time is not None: + result['intermediate'] += [(self.sol_time, obj)] result['objective_value'] = obj obj = None elif line.startswith('c took '): diff --git a/cpmpy/tools/benchmark/psplib.py b/cpmpy/tools/benchmark/psplib.py index 26046cf84..0f1a1639f 100644 --- a/cpmpy/tools/benchmark/psplib.py +++ b/cpmpy/tools/benchmark/psplib.py @@ -75,6 +75,7 @@ class PSPLIBBenchmark(Benchmark): """ def __init__(self): + self.sol_time = None super().__init__(reader=read_rcpsp) # TODO: reader should depend on problem variant def print_comment(self, comment:str): @@ -94,9 +95,11 @@ def print_intermediate(self, objective:int): def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_objective(s.objective_value()) self.print_value(solution_psplib(s)) self.print_status(ExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_objective(s.objective_value()) self.print_value(solution_psplib(s)) self.print_status(ExitStatus.sat) elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: @@ -154,7 +157,8 @@ def parse_output_line(self, line, result): obj = int(line[2:].strip()) if result['intermediate'] is None: result['intermediate'] = [] - 
result['intermediate'] += [(self.sol_time, obj)] + if self.sol_time is not None: + result['intermediate'] += [(self.sol_time, obj)] result['objective_value'] = obj obj = None elif line.startswith('c took '): diff --git a/cpmpy/tools/benchmark/xcsp3.py b/cpmpy/tools/benchmark/xcsp3.py index 47d0289e4..176d42d18 100644 --- a/cpmpy/tools/benchmark/xcsp3.py +++ b/cpmpy/tools/benchmark/xcsp3.py @@ -182,17 +182,17 @@ def parse_output_line(self, line, result): complete_solution = line if "cost" in solution: result['objective_value'] = solution.split('cost="')[-1][:-2] + elif line.startswith('c Solution'): + parts = line.split(', time = ') + # Get solution time from comment for intermediate solution -> used for annotating 'o ...' lines + self._sol_time = float(parts[-1].replace('s', '').rstrip()) elif line.startswith('o '): obj = int(line[2:].strip()) if result['intermediate'] is None: result['intermediate'] = [] - result['intermediate'] += [(sol_time, obj)] + result['intermediate'] += [(self._sol_time, obj)] result['objective_value'] = obj obj = None - elif line.startswith('c Solution'): - parts = line.split(', time = ') - # Get solution time from comment for intermediate solution -> used for annotating 'o ...' 
lines - sol_time = float(parts[-1].replace('s', '').rstrip()) elif line.startswith('c took '): # Parse timing information parts = line.split(' seconds to ') From 62b605d99109fa9bafd4b16ca92a0bb812963121 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 3 Nov 2025 17:08:37 +0100 Subject: [PATCH 041/152] correct jsplib output file name --- cpmpy/tools/benchmark/jsplib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/jsplib.py b/cpmpy/tools/benchmark/jsplib.py index e9dacb7ce..343c2dfdd 100644 --- a/cpmpy/tools/benchmark/jsplib.py +++ b/cpmpy/tools/benchmark/jsplib.py @@ -205,7 +205,7 @@ def parse_output_line(self, line, result): timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") # Define output file path with timestamp - output_file = str(output_dir / "jsplib" / f"psplib_{args.solver}_{timestamp}.csv") + output_file = str(output_dir / "jsplib" / f"jsplib_{args.solver}_{timestamp}.csv") # Run the benchmark instance_runner = JSPLibBenchmark() From ddf69389644bd69c9efa2cc06090a0d399dbef0c Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 3 Nov 2025 17:09:02 +0100 Subject: [PATCH 042/152] remove matplotlib import --- cpmpy/tools/dataset/problem/jsplib.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpmpy/tools/dataset/problem/jsplib.py b/cpmpy/tools/dataset/problem/jsplib.py index d110c48db..17453fe32 100644 --- a/cpmpy/tools/dataset/problem/jsplib.py +++ b/cpmpy/tools/dataset/problem/jsplib.py @@ -16,8 +16,6 @@ import zipfile import numpy as np -from matplotlib import pyplot as plt - import cpmpy as cp class JSPLibDataset(object): # torch.utils.data.Dataset compatible From 344aaafd94fef5765b6d7baa22e111ad204cf7c4 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 3 Nov 2025 17:09:23 +0100 Subject: [PATCH 043/152] xcsp3 track intermediate sol time --- cpmpy/tools/benchmark/xcsp3.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/xcsp3.py b/cpmpy/tools/benchmark/xcsp3.py index 
176d42d18..1bc70ad9b 100644 --- a/cpmpy/tools/benchmark/xcsp3.py +++ b/cpmpy/tools/benchmark/xcsp3.py @@ -121,6 +121,7 @@ class XCSP3Benchmark(Benchmark): """ def __init__(self): + self._sol_time = None super().__init__(reader=read_xcsp3, exit_status=XCSP3ExitStatus) def print_comment(self, comment:str): @@ -190,7 +191,8 @@ def parse_output_line(self, line, result): obj = int(line[2:].strip()) if result['intermediate'] is None: result['intermediate'] = [] - result['intermediate'] += [(self._sol_time, obj)] + if self._sol_time is not None: + result['intermediate'] += [(self._sol_time, obj)] result['objective_value'] = obj obj = None elif line.startswith('c took '): From 7cd1bb1630345303c42d49a2dc328572a7e6ad6a Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 3 Nov 2025 17:09:41 +0100 Subject: [PATCH 044/152] opb print intermediate solutions --- cpmpy/tools/benchmark/opb.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/opb.py b/cpmpy/tools/benchmark/opb.py index 5c0c222cd..0c571a058 100644 --- a/cpmpy/tools/benchmark/opb.py +++ b/cpmpy/tools/benchmark/opb.py @@ -83,6 +83,7 @@ class OPBBenchmark(Benchmark): """ def __init__(self): + self.sol_time = None super().__init__(reader=read_opb, exit_status=OPBExitStatus) def print_comment(self, comment:str): @@ -103,9 +104,11 @@ def print_intermediate(self, objective:int): def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_objective(s.objective_value()) self.print_value(solution_opb(s)) self.print_status(OPBExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_objective(s.objective_value()) self.print_value(solution_opb(s)) self.print_status(OPBExitStatus.sat) elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: @@ -162,7 +165,8 @@ def parse_output_line(self, line, result): obj = int(line[2:].strip()) if result['intermediate'] is None: result['intermediate'] = [] - result['intermediate'] += [(self.sol_time, 
obj)] + if self.sol_time is not None: + result['intermediate'] += [(self.sol_time, obj)] result['objective_value'] = obj obj = None elif line.startswith('c took '): From a21a0404c22463fae25e945e1cd4f5c688cfad80 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 3 Nov 2025 17:09:54 +0100 Subject: [PATCH 045/152] mse print intermediate solutions --- cpmpy/tools/benchmark/mse.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/mse.py b/cpmpy/tools/benchmark/mse.py index 656467bf9..a1936346e 100644 --- a/cpmpy/tools/benchmark/mse.py +++ b/cpmpy/tools/benchmark/mse.py @@ -90,6 +90,7 @@ class MSEBenchmark(Benchmark): """ def __init__(self): + self._sol_time = None super().__init__(reader=read_wcnf, exit_status=MSEExitStatus) def print_comment(self, comment:str): @@ -109,9 +110,11 @@ def print_intermediate(self, objective:int): def print_result(self, s): if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_objective(s.objective_value()) self.print_value(solution_mse(s)) self.print_status(MSEExitStatus.optimal) elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_objective(s.objective_value()) self.print_value(solution_mse(s)) self.print_status(MSEExitStatus.sat) elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: @@ -169,7 +172,8 @@ def parse_output_line(self, line, result): obj = int(line[2:].strip()) if result['intermediate'] is None: result['intermediate'] = [] - result['intermediate'] += [(self._sol_time, obj)] + if self._sol_time is not None: + result['intermediate'] += [(self._sol_time, obj)] result['objective_value'] = obj obj = None elif line.startswith('c took '): From eda839c8b9df7df12c238a9e16a4bde6494fa8aa Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 3 Nov 2025 17:13:07 +0100 Subject: [PATCH 046/152] cplex and hexaly solver arguments --- cpmpy/tools/benchmark/_base.py | 93 ++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/cpmpy/tools/benchmark/_base.py 
b/cpmpy/tools/benchmark/_base.py index 4de8ce816..ce103dcab 100644 --- a/cpmpy/tools/benchmark/_base.py +++ b/cpmpy/tools/benchmark/_base.py @@ -338,7 +338,96 @@ def solution_count(self): res |= { "solution_callback": CpoSolutionCallback } return res, None + + def cplex_arguments( + self, + cores: Optional[int] = None, + seed: Optional[int] = None, + **kwargs + ): + res = dict() + if cores is not None: + res |= {"threads": cores} + if seed is not None: + res |= {"randomseed": seed} + + return res, None + + def hexaly_arguments( + self, + model: cp.Model, + cores: Optional[int] = None, + seed: Optional[int] = None, + intermediate: bool = False, + **kwargs + ): + res = dict() + #res |= {"nb_threads": cores} + #res |= {"seed": seed} + + + if intermediate and model.has_objective(): + # Define custom Hexaly solution callback, then register it + + _self = self + class HexSolutionCallback: + def __init__(self): + self.__start_time = time.time() + self.__solution_count = 0 + + + def on_solution_callback(self, optimizer, cb_type): + """Called on each new solution.""" + # check if solution with different objective (or if verbose) + current_time = time.time() + obj = optimizer.model.objectives[0] + _self.print_comment('Solution %i, time = %0.4fs' % + (self.__solution_count, current_time - self.__start_time)) + _self.print_intermediate(obj) + self.__solution_count += 1 + + def solution_count(self): + return self.__solution_count + + # Register the callback + res |= { "solution_callback": HexSolutionCallback().on_solution_callback } + + + # def internal_options(solver: "CPM_hexaly"): + # # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 + # #solver.native_model.get_param().set_seed(seed) + # #solver.native_model.get_param().set_nr_threads(cores) + + # _self = self + # class CallbackExample: + # def __init__(self): + # self.last_best_value = 0 + # self.last_best_running_time = 0 + # 
self.__solution_count = 0 + # self.__start_time = time.time() + + # def my_callback(self, optimizer, cb_type): + # stats = optimizer.statistics + # obj = optimizer.model.objectives[0] + # current_time = time.time() + # #obj = int(self.ObjectiveValue()) + # #obj = optimizer.get_objective_bound(0).value + # if obj.value > self.last_best_value: + # self.last_best_running_time = stats.running_time + # self.last_best_value = obj.value + # self.__solution_count += 1 + + # _self.print_comment('Solution %i, time = %0.4fs' % + # (self.__solution_count, current_time - self.__start_time)) + # _self.print_intermediate(obj.value) + + # optimizer = solver.native_model + # cb = CallbackExample() + # from hexaly.optimizer import HxCallbackType + # optimizer.add_callback(HxCallbackType.TIME_TICKED, cb.my_callback) + + return res, None """ Methods which can, bit most likely shouldn't, be overwritten. @@ -420,6 +509,10 @@ def solver_arguments( return self.gurobi_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, intermediate=intermediate, opt=opt, **kwargs) elif solver == "cpo": return self.cpo_arguments(model=model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) + elif solver == "hexaly": + return self.hexaly_arguments(model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) + elif solver == "cplex": + return self.cplex_arguments(cores=cores, **kwargs) else: self.print_comment(f"setting parameters of {solver} is not (yet) supported") return dict(), None From 2004cfeacadfe8d79d57d8a8dbe2e45826dda97d Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 6 Jan 2026 14:17:03 +0100 Subject: [PATCH 047/152] Add nurse rostering dataset --- cpmpy/tools/dataset/problem/nurserostering.py | 544 ++++++++++++++++++ examples/nurserostering.py | 376 ++++-------- 2 files changed, 648 insertions(+), 272 deletions(-) create mode 100644 cpmpy/tools/dataset/problem/nurserostering.py diff --git a/cpmpy/tools/dataset/problem/nurserostering.py 
b/cpmpy/tools/dataset/problem/nurserostering.py new file mode 100644 index 000000000..f9f3c0c61 --- /dev/null +++ b/cpmpy/tools/dataset/problem/nurserostering.py @@ -0,0 +1,544 @@ +""" +PyTorch-style Dataset for Nurserostering instances from schedulingbenchmarks.org + +Simply create a dataset instance and start iterating over its contents: +The `metadata` contains usefull information about the current problem instance. + +https://schedulingbenchmarks.org/nrp/ +""" +import os +import pathlib +from typing import Tuple, Any +from urllib.request import urlretrieve +from urllib.error import HTTPError, URLError +import zipfile +import re + +import cpmpy as cp + +# Optional dependencies +try: + import pandas as pd + _HAS_PANDAS = True +except ImportError: + _HAS_PANDAS = False + +try: + from faker import Faker + _HAS_FAKER = True +except ImportError: + _HAS_FAKER = False + + +class NurseRosteringDataset(object): # torch.utils.data.Dataset compatible + + """ + Nurserostering Dataset in a PyTorch compatible format. + + More information on nurserostering instances can be found here: https://schedulingbenchmarks.org/nrp/ + """ + + def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False, sort_key=None): + """ + Initialize the Nurserostering Dataset. + + Arguments: + root (str): Root directory containing the nurserostering instances (if 'download', instances will be downloaded to this location) + transform (callable, optional): Optional transform to be applied on the instance data + target_transform (callable, optional): Optional transform to be applied on the file path + download (bool): If True, downloads the dataset from the internet and puts it in `root` directory + sort_key (callable, optional): Optional function to sort instance files. If None, uses Python's built-in sorted(). + For natural/numeric sorting, pass natsorted from natsort library. 
+ Example: from natsort import natsorted; dataset = NurseRosteringDataset(..., sort_key=natsorted) + """ + + self.root = pathlib.Path(root) + self.instance_dir = pathlib.Path(os.path.join(self.root, "nurserostering")) + self.transform = transform + self.target_transform = target_transform + self.sort_key = sorted if sort_key is None else sort_key + + # Create root directory if it doesn't exist + self.root.mkdir(parents=True, exist_ok=True) + + if not self.instance_dir.exists(): + if not download: + raise ValueError(f"Dataset not found in local file system. Please set download=True to download the dataset.") + else: + url = f"https://schedulingbenchmarks.org/nrp/data/instances1_24.zip" # download full repo... + zip_path = pathlib.Path(os.path.join(root,"jsplib-master.zip")) + + print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") + + try: + urlretrieve(url, str(zip_path)) + except (HTTPError, URLError) as e: + raise ValueError(f"No dataset available on {url}. Error: {str(e)}") + + # make directory and extract files + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + self.instance_dir.mkdir(parents=True, exist_ok=True) + + # Extract files + for file_info in zip_ref.infolist(): + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.instance_dir / filename, 'wb') as target: + target.write(source.read()) + + # Clean up the zip file + zip_path.unlink() + + + def __len__(self) -> int: + """Return the total number of instances.""" + return len(list(self.instance_dir.glob("*.txt"))) + + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Get a single Nurserostering instance filename and metadata. + + Args: + index (int): Index of the instance to retrieve + + Returns: + Tuple[Any, Any]: A tuple containing: + - The filename of the instance + - Metadata dictionary with file name, track, year etc. 
+ """ + if isinstance(index, int) and not (0 <= index < len(self)): + raise IndexError("Index out of range") + + # Get all instance files and sort for deterministic behavior + files = self.sort_key(list(self.instance_dir.glob("*.txt"))) # use .txt files instead of xml files + file_path = files[index] + + filename = str(file_path) + if self.transform: + # user might want to process the filename to something else + filename = self.transform(filename) + + metadata = dict(name=file_path.stem) + + if self.target_transform: + metadata = self.target_transform(metadata) + + return filename, metadata + + def open(self, instance: os.PathLike) -> callable: + return open(instance, "r") + + +def _tag_to_data(string, tag, skip_lines=0, datatype=None, names=None, dtype=None): + """ + Extract data from a tagged section in the input string. + + Args: + string: Input string containing tagged sections + tag: Tag name to search for (e.g., "SECTION_SHIFTS") + skip_lines: Number of lines to skip after the tag + datatype: Type hint for return value. If None, returns list of dicts (CSV rows). + If int, str, etc., returns that type parsed from first line. + names: Optional list of column names to rename headers to. If provided, must match + the number of columns or be shorter (extra columns will keep original names). + dtype: Optional dict mapping column names to data types for conversion. 
+ Example: {'Length': int, 'ShiftID': str} + + Returns: + If datatype is None: list of dicts (CSV rows as dictionaries) + If datatype is int, str, etc.: parsed value from first line + """ + regex = rf'{tag}[\s\S]*?($|(?=\n\s*\n))' + match = re.search(regex, string) + + if not match: + return None + + lines = list(match.group().split("\n")[skip_lines+1:]) + if not lines: + return None + + # If datatype is a simple type (int, str, etc.), parse accordingly + if datatype is not None and datatype not in (list, dict): + if datatype is int or datatype is float: + # For numeric types, return first line + first_line = lines[0].strip() + return datatype(first_line) if first_line else None + elif datatype is str: + # For string type, return the whole data section + return "\n".join(lines).strip() + + # Parse header + headers = lines[0].split(",") + # Clean headers: remove # and strip whitespace, but keep exact names + headers = [h.replace("#", "").strip() for h in headers] + + # Rename columns if names provided + if names is not None: + for i, new_name in enumerate(names): + if i < len(headers): + headers[i] = new_name + + # Parse data rows + rows = [] + for line in lines[1:]: + if not line.strip(): + continue + values = line.split(",") + # Pad values if needed + while len(values) < len(headers): + values.append("") + row = {} + for i in range(len(headers)): + value = values[i].strip() if i < len(values) else "" + col_name = headers[i] + + # Apply type conversion if dtype specified + if dtype is not None and col_name in dtype: + target_type = dtype[col_name] + row[col_name] = target_type(value) if value else None + else: + row[col_name] = value + rows.append(row) + + return rows + +def parse_scheduling_period(filename: str): + """ + Parse a nurserostering instance file. + + Returns a dictionary with native Python data structures (lists of dicts). + Use to_dataframes() transform to convert to pandas DataFrames if needed. 
+ Use add_fake_names() transform to add randomly generated names to staff. + """ + with open(filename, "r") as f: + string = f.read() + + # Parse scheduling horizon + horizon = int(_tag_to_data(string, "SECTION_HORIZON", skip_lines=2, datatype=int)) + + # Parse shifts - list of dicts with ShiftID as key + shifts_rows = _tag_to_data(string, "SECTION_SHIFTS", + names=["ShiftID", "Length", "cannot follow"], + dtype={'ShiftID': str, 'Length': int, 'cannot follow': str}) + shifts = {} + for row in shifts_rows: + cannot_follow_str = row.get("cannot follow") or "" + shifts[row["ShiftID"]] = { + "Length": row["Length"], + "cannot follow": [v.strip() for v in cannot_follow_str.split("|") if v.strip()] + } + + # Parse staff - list of dicts + staff = _tag_to_data(string, "SECTION_STAFF", + names=["ID", "MaxShifts", "MaxTotalMinutes", "MinTotalMinutes", "MaxConsecutiveShifts", "MinConsecutiveShifts", "MinConsecutiveDaysOff", "MaxWeekends"], + dtype={'MaxShifts': str, 'MaxTotalMinutes': int, 'MinTotalMinutes': int, 'MaxConsecutiveShifts': int, 'MinConsecutiveShifts': int, 'MinConsecutiveDaysOff': int, 'MaxWeekends': int}) + + # Process MaxShifts column - split by | and create max_shifts_* columns + for idx, nurse in enumerate(staff): + max_shifts_str = nurse.get("MaxShifts", "").strip() + if max_shifts_str: + max_shift_parts = max_shifts_str.split("|") + for part in max_shift_parts: + if "=" in part: + shift_id, max_val = part.split("=", 1) + shift_id = shift_id.strip() + max_val = max_val.strip() + if shift_id and max_val: + nurse[f"max_shifts_{shift_id}"] = int(max_val) + + # Parse days off - this section has variable columns (EmployeeID + N day indices) + # Parse as raw string since column count varies per row + days_off_raw = _tag_to_data(string, "SECTION_DAYS_OFF", datatype=str) + days_off = [] + if days_off_raw: + for line in days_off_raw.split("\n"): + line = line.strip() + if not line or line.startswith("#") or line.lower().startswith("employeeid"): + continue + # Parse 
CSV-style line (handles variable number of columns) + parts = line.split(",") + if len(parts) > 0: + employee_id = parts[0].strip() + # Remaining parts are day indices + for day_str in parts[1:]: + day_str = day_str.strip() + if day_str and day_str.isdigit(): + day_idx = int(day_str) + if 0 <= day_idx < horizon: + days_off.append({"EmployeeID": employee_id, "DayIndex": day_idx}) + + # Parse shift requests + shift_on = _tag_to_data(string, "SECTION_SHIFT_ON_REQUESTS", + names=["EmployeeID", "Day", "ShiftID", "Weight"], + dtype={'Weight': int, "Day": int, "ShiftID": str}) + shift_off = _tag_to_data(string, "SECTION_SHIFT_OFF_REQUESTS", + names=["EmployeeID", "Day", "ShiftID", "Weight"], + dtype={'Weight': int, "Day": int, "ShiftID": str}) + cover = _tag_to_data(string, "SECTION_COVER", + names=["Day", "ShiftID", "Requirement", "Weight for under", "Weight for over"], + dtype={'Day': int, 'ShiftID': str, 'Requirement': int, 'Weight for under': int, 'Weight for over': int}) + + return dict(horizon=horizon, shifts=shifts, staff=staff, days_off=days_off, + shift_on=shift_on, shift_off=shift_off, cover=cover) + + +def add_fake_names(data, seed=0): + """ + Transform function to add randomly generated names to staff using Faker. + + This function can be used as a transform argument to NurseRosteringDataset + to add fake names to the parsed data. 
+ + Example: + dataset = NurseRosteringDataset( + root=".", + transform=lambda fname: add_fake_names(parse_scheduling_period(fname)) + ) + + Or combine with other transforms: + dataset = NurseRosteringDataset( + root=".", + transform=lambda fname: to_dataframes( + add_fake_names(parse_scheduling_period(fname)) + ) + ) + + Args: + data: Dictionary returned by parse_scheduling_period() + seed: Random seed for reproducible name generation (default: 0) + + Returns: + Dictionary with 'name' field added to each staff member + + Raises: + ImportError: If Faker is not installed + """ + if not _HAS_FAKER: + raise ImportError("Faker is required for add_fake_names(). Install it with: pip install faker") + + fake = Faker() + fake.seed_instance(seed) + + # Add names to staff + for idx, nurse in enumerate(data["staff"]): + nurse["name"] = fake.unique.first_name() + + return data + + +def to_dataframes(data): + """ + Transform function to convert native data structures to pandas DataFrames. + + This function can be used as a transform argument to NurseRosteringDataset + to convert the parsed data into pandas DataFrames for easier manipulation. + + Example: + dataset = NurseRosteringDataset( + root=".", + transform=lambda fname: to_dataframes(parse_scheduling_period(fname)) + ) + + Args: + data: Dictionary returned by parse_scheduling_period() + + Returns: + Dictionary with pandas DataFrames instead of native structures + + Raises: + ImportError: If pandas is not installed + """ + if not _HAS_PANDAS: + raise ImportError("pandas is required for to_dataframes(). 
def nurserostering_model(horizon, shifts, staff, days_off, shift_on, shift_off, cover):
    """
    Create a CPMpy model for nurserostering.

    The decision variables are one integer per (nurse, day): value 0 ("F")
    means a day off, value k (k >= 1) means the k-th shift type in the
    ``shifts`` dict ordering.

    Args:
        horizon: Number of days in the scheduling period
        shifts: Dict mapping shift_id to dict with shift data
            (keys used here: 'Length' in minutes, 'cannot follow' list of shift ids)
        staff: List of dicts, each representing a nurse with their constraints
        days_off: List of dicts with days off for each nurse
        shift_on: List of dicts with shift-on requests for each nurse
        shift_off: List of dicts with shift-off requests for each nurse
        cover: List of dicts with cover requirements for each day and shift

    Returns:
        (model, nurse_view): the CPMpy Model (minimization objective) and the
        (n_nurses x horizon) matrix of assignment variables.
    """
    n_nurses = len(staff)

    # Value 0 encodes a free day; shift k is encoded as index k in SHIFTS
    FREE = 0
    shift_ids = list(shifts.keys())
    SHIFTS = ["F"] + shift_ids

    nurse_view = cp.intvar(0, len(shifts), shape=(n_nurses, horizon), name="nv")

    model = cp.Model()

    # Shifts which cannot follow the shift on the previous day.
    # Vectorized over all nurses: compares day d against day d+1.
    for shift_id, shift_data in shifts.items():
        for other_shift in shift_data['cannot follow']:
            model += (nurse_view[:,:-1] == SHIFTS.index(shift_id)).implies(
                nurse_view[:,1:] != SHIFTS.index(other_shift))

    # Maximum number of shifts of each type that can be assigned to each employee.
    for i, nurse in enumerate(staff):
        for shift_id in shift_ids:
            max_shifts = nurse[f"max_shifts_{shift_id}"]
            model += cp.Count(nurse_view[i], SHIFTS.index(shift_id)) <= max_shifts

    # Minimum and maximum amount of total time in minutes that can be assigned to each employee.
    # shift_length is indexed by the encoded shift value (element constraint).
    shift_length = cp.cpm_array([0] + [shifts[sid]['Length'] for sid in shift_ids]) # FREE = length 0
    for i, nurse in enumerate(staff):
        time_worked = cp.sum(shift_length[nurse_view[i,d]] for d in range(horizon))
        # NOTE(review): .get() returns None if the key is missing, which would
        # fail the comparison below — confirm these fields are always present.
        model += time_worked <= nurse.get('MaxTotalMinutes')
        model += time_worked >= nurse.get('MinTotalMinutes')

    # Maximum number of consecutive shifts that can be worked before having a day off.
    for i, nurse in enumerate(staff):
        max_days = nurse.get('MaxConsecutiveShifts')
        for d in range(horizon - max_days):
            # Every window of max_days+1 days must contain at least one free day
            window = nurse_view[i,d:d+max_days+1]
            model += cp.Count(window, FREE) >= 1 # at least one holiday in this window

    # Minimum number of consecutive shifts that must be worked before having a day off.
    for i, nurse in enumerate(staff):
        min_days = nurse.get('MinConsecutiveShifts')
        for d in range(1, horizon):
            # Detect transitions FREE -> working; slicing past the horizon is
            # silently clipped, so the end of the schedule is unconstrained.
            is_start_of_working_period = (nurse_view[i, d-1] == FREE) & (nurse_view[i, d] != FREE)
            model += is_start_of_working_period.implies(cp.all(nurse_view[i,d:d+min_days] != FREE))

    # Minimum number of consecutive days off (mirror of the constraint above).
    for i, nurse in enumerate(staff):
        min_days = nurse.get('MinConsecutiveDaysOff')
        for d in range(1, horizon):
            is_start_of_free_period = (nurse_view[i, d - 1] != FREE) & (nurse_view[i, d] == FREE)
            model += is_start_of_free_period.implies(cp.all(nurse_view[i, d:d + min_days] == FREE))

    # Max number of working weekends for each nurse.
    # Assumes day 0 is a Monday, so weekends are day pairs (5,6), (12,13), ...
    weekends = [(i - 1, i) for i in range(1, horizon) if (i + 1) % 7 == 0]
    for i, nurse in enumerate(staff):
        # A weekend counts as "worked" if either Saturday or Sunday is worked
        n_weekends = cp.sum((nurse_view[i,sat] != FREE) | (nurse_view[i,sun] != FREE) for sat,sun in weekends)
        model += n_weekends <= nurse.get('MaxWeekends')

    # Days off (hard constraints)
    for holiday in days_off:
        # NOTE(review): next(..., None) yields None for an unknown EmployeeID,
        # which would make the indexing below pick the last row — confirm all
        # referenced IDs exist in `staff`.
        i = next((idx for idx, nurse in enumerate(staff) if nurse['ID'] == holiday['EmployeeID']), None) # index of employee
        model += nurse_view[i,holiday['DayIndex']] == FREE

    # Shift requests, encode in linear objective (penalty when NOT granted)
    objective = 0
    for request in shift_on:
        i = next((idx for idx, nurse in enumerate(staff) if nurse['ID'] == request['EmployeeID']), None) # index of employee
        cpm_request = nurse_view[i, request['Day']] == SHIFTS.index(request['ShiftID'])
        objective += request['Weight'] * ~cpm_request

    # Shift off requests, encode in linear objective
    for request in shift_off:
        i = next((idx for idx, nurse in enumerate(staff) if nurse['ID'] == request['EmployeeID']), None) # index of employee
        cpm_request = nurse_view[i, request['Day']] != SHIFTS.index(request['ShiftID'])
        objective += request['Weight'] * ~cpm_request

    # Cover constraints, encode in objective with slack variables:
    # nb_nurses == Requirement + slack_over - slack_under, with both slacks penalized
    for cover_request in cover:
        nb_nurses = cp.Count(nurse_view[:, cover_request['Day']], SHIFTS.index(cover_request['ShiftID']))
        slack_over, slack_under = cp.intvar(0, len(staff), shape=2)
        model += nb_nurses - slack_over + slack_under == cover_request["Requirement"]
        objective += cover_request["Weight for over"] * slack_over + cover_request["Weight for under"] * slack_under

    model.minimize(objective)

    return model, nurse_view
day_label in enumerate(day_labels): + print(f" {day_label:>{col_widths[d]}}", end="") + print() + + # Print separator + print("-" * (row_label_width + 1 + sum(w + 1 for w in col_widths))) + + # Print rows + for label, row in zip(row_labels, table): + print(f"{label:<{row_label_width}}", end="") + for d, val in enumerate(row): + print(f" {str(val):>{col_widths[d]}}", end="") + print() diff --git a/examples/nurserostering.py b/examples/nurserostering.py index ce424d667..6a1ef7921 100644 --- a/examples/nurserostering.py +++ b/examples/nurserostering.py @@ -1,292 +1,124 @@ """ -PyTorch-style Dataset for Nurserostering instances from schedulingbenchmarks.org +Example usage of the Nurserostering Dataset from schedulingbenchmarks.org -Simply create a dataset instance and start iterating over its contents: -The `metadata` contains usefull information about the current problem instance. +This example demonstrates how to use the dataset loader to parse and solve +nurserostering instances. + +https://schedulingbenchmarks.org/nrp/ """ -import copy -import pathlib -from io import StringIO -from os.path import join -from typing import Tuple, Any -from urllib.request import urlretrieve -from urllib.error import HTTPError, URLError -import zipfile -import pandas as pd -try: - from faker import Faker -except ImportError as e: - print("Install `faker` package using `pip install faker`") - raise e +from cpmpy.tools.dataset.problem.nurserostering import ( + NurseRosteringDataset, + parse_scheduling_period, + nurserostering_model, + to_dataframes +) + try: from natsort import natsorted -except ImportError as e: - print("Install `natsort` package using `pip install natsort`") - raise e - -pd.set_option('display.max_columns', 500) -pd.set_option('display.width', 5000) - -import cpmpy as cp - -class NurseRosteringDataset(object): # torch.utils.data.Dataset compatible - - """ - Nurserostering Dataset in a PyTorch compatible format. 
- - Arguments: - root (str): Root directory containing the nurserostering instances (if 'download', instances will be downloaded to this location) - transform (callable, optional): Optional transform to be applied on the instance data - target_transform (callable, optional): Optional transform to be applied on the file path - download (bool): If True, downloads the dataset from the internet and puts it in `root` directory - """ - - def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False): - """ - Initialize the Nurserostering Dataset. - """ - - self.root = pathlib.Path(root) - self.instance_dir = pathlib.Path(join(self.root, "nurserostering")) - self.transform = transform - self.target_transform = target_transform - - # Create root directory if it doesn't exist - self.root.mkdir(parents=True, exist_ok=True) - - if not self.instance_dir.exists(): - if not download: - raise ValueError(f"Dataset not found in local file system. Please set download=True to download the dataset.") - else: - url = f"https://schedulingbenchmarks.org/nrp/data/instances1_24.zip" # download full repo... - zip_path = pathlib.Path(join(root,"jsplib-master.zip")) - - print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") - - try: - urlretrieve(url, str(zip_path)) - except (HTTPError, URLError) as e: - raise ValueError(f"No dataset available on {url}. 
Error: {str(e)}") - - # make directory and extract files - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - self.instance_dir.mkdir(parents=True, exist_ok=True) - - # Extract files - for file_info in zip_ref.infolist(): - filename = pathlib.Path(file_info.filename).name - with zip_ref.open(file_info) as source, open(self.instance_dir / filename, 'wb') as target: - target.write(source.read()) - - # Clean up the zip file - zip_path.unlink() - - - def __len__(self) -> int: - """Return the total number of instances.""" - return len(list(self.instance_dir.glob("*.txt"))) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - """ - Get a single Nurserostering instance filename and metadata. - - Args: - index (int): Index of the instance to retrieve - - Returns: - Tuple[Any, Any]: A tuple containing: - - The filename of the instance - - Metadata dictionary with file name, track, year etc. - """ - if isinstance(index, int) and not (0 <= index < len(self)): - raise IndexError("Index out of range") - - # Get all instance files and sort for deterministic behavior - files = natsorted(list(self.instance_dir.glob("*.txt"))) # use .txt files instead of xml files - file_path = files[index] - - filename = str(file_path) - if self.transform: - # user might want to process the filename to something else - filename = self.transform(filename) - - metadata = dict(name=file_path.stem) - - if self.target_transform: - metadata = self.target_transform(metadata) - - return filename, metadata - - -import re -def _tag_to_data(string, tag, skip_lines=0, datatype=pd.DataFrame, *args, **kwargs): - - regex = rf'{tag}[\s\S]*?($|(?=\n\s*\n))' - match = re.search(regex, string) - - data = "\n".join(match.group().split("\n")[skip_lines+1:]) - if datatype == pd.DataFrame: - kwargs = {"header":0, "index_col":0} | kwargs - df = pd.read_csv(StringIO(data), *args, **kwargs) - return df.rename(columns=lambda x: x.replace("#","").strip()) - return datatype(data, *args, **kwargs) - -def 
parse_scheduling_period(fname): - fake = Faker() - fake.seed_instance(0) - - with open(fname, "r") as f: - string = f.read() + sort_key = natsorted +except ImportError: + sort_key = None # Use default sorted() - - horizon = _tag_to_data(string, "SECTION_HORIZON", skip_lines=2, datatype=int) - shifts = _tag_to_data(string, "SECTION_SHIFTS", names=["ShiftID", "Length", "cannot follow"], - dtype={'ShiftID':str, 'Length':int, 'cannot follow':str}) - shifts.fillna("", inplace=True) - shifts["cannot follow"] = shifts["cannot follow"].apply(lambda val : [v.strip() for v in val.split("|") if len(v.strip())]) - - staff = _tag_to_data(string, "SECTION_STAFF", index_col=False) - maxes = staff["MaxShifts"].str.split("|", expand=True) - for col in maxes: - shift_id = maxes[col].iloc[0].split("=")[0] - column = maxes[col].apply(lambda x : x.split("=")[1]) - staff[f"max_shifts_{shift_id}"] = column.astype(int) - - staff["name"] = [fake.unique.first_name() for _ in staff.index] - - days_off = _tag_to_data(string, "SECTION_DAYS_OFF", datatype=str) - # process string to be EmployeeID, Day off for each line - rows = [] - for line in days_off.split("\n")[1:]: - employee_id , *days = line.split(",") - rows += [dict(EmployeeID=employee_id, DayIndex= int(d)) for d in days] - days_off = pd.DataFrame(rows) - - - shift_on = _tag_to_data(string, "SECTION_SHIFT_ON_REQUESTS", index_col=False) - shift_off = _tag_to_data(string, "SECTION_SHIFT_OFF_REQUESTS", index_col=False) - cover = _tag_to_data(string, "SECTION_COVER", index_col=False) - - return dict(horizon=horizon, shifts=shifts, staff=staff, days_off=days_off, shift_on=shift_on, shift_off=shift_off, cover=cover) - - -def nurserostering_model(horizon, shifts:pd.DataFrame, staff, days_off, shift_on, shift_off, cover): - - n_nurses = len(staff) - - FREE = 0 - SHIFTS = ["F"] + list(shifts.index) - - nurse_view = cp.intvar(0,len(shifts), shape=(n_nurses, horizon), name="nv") - - model = cp.Model() - - # Shifts which cannot follow the shift on 
the previous day. - for id, shift in shifts.iterrows(): - for other_shift in shift['cannot follow']: - model += (nurse_view[:,:-1] == SHIFTS.index(id)).implies(nurse_view[:,1:] != SHIFTS.index(other_shift)) - - # Maximum number of shifts of each type that can be assigned to each employee. - for i, nurse in staff.iterrows(): - for shift_id, shift in shifts.iterrows(): - max_shifts = nurse[f"max_shifts_{shift_id}"] - model += cp.Count(nurse_view[i], SHIFTS.index(shift_id)) <= max_shifts - - # Minimum and maximum amount of total time in minutes that can be assigned to each employee. - shift_length = cp.cpm_array([0] + shifts['Length'].tolist()) # FREE = length 0 - for i, nurse in staff.iterrows(): - time_worked = cp.sum(shift_length[nurse_view[i,d]] for d in range(horizon)) - model += time_worked <= nurse['MaxTotalMinutes'] - model += time_worked >= nurse['MinTotalMinutes'] - - # Maximum number of consecutive shifts that can be worked before having a day off. - for i, nurse in staff.iterrows(): - max_days = nurse['MaxConsecutiveShifts'] - for d in range(horizon - max_days): - window = nurse_view[i,d:d+max_days+1] - model += cp.Count(window, FREE) >= 1 # at least one holiday in this window - - # Minimum number of concecutive shifts that must be worked before having a day off. - for i, nurse in staff.iterrows(): - min_days = nurse['MinConsecutiveShifts'] - for d in range(1,horizon): - is_start_of_working_period = (nurse_view[i, d-1] == FREE) & (nurse_view[i, d] != FREE) - model += is_start_of_working_period.implies(cp.all(nurse_view[i,d:d+min_days] != FREE)) - - # Minimum number of concecutive days off. 
- for i, nurse in staff.iterrows(): - min_days = nurse['MinConsecutiveDaysOff'] - for d in range(1,horizon): - is_start_of_free_period = (nurse_view[i, d - 1] != FREE) & (nurse_view[i, d] == FREE) - model += is_start_of_free_period.implies(cp.all(nurse_view[i, d:d + min_days] == FREE)) - - # Max number of working weekends for each nurse - weekends = [(i - 1, i) for i in range(1,horizon) if (i + 1) % 7 == 0] - for i, nurse in staff.iterrows(): - n_weekends = cp.sum((nurse_view[i,sat] != FREE) | (nurse_view[i,sun] != FREE) for sat,sun in weekends) - model += n_weekends <= nurse['MaxWeekends'] - - # Days off - for _, holiday in days_off.iterrows(): # could also do this vectorized... TODO? - i = (staff['ID'] == holiday['EmployeeID']).argmax() # index of employee - model += nurse_view[i,holiday['DayIndex']] == FREE - - # Shift requests, encode in linear objective - objective = 0 - for _, request in shift_on.iterrows(): - i = (staff['ID'] == request['EmployeeID']).argmax() # index of employee - cpm_request = nurse_view[i, request['Day']] == SHIFTS.index(request['ShiftID']) - objective += request['Weight'] * ~cpm_request - - # Shift off requests, encode in linear objective - for _, request in shift_off.iterrows(): - i = (staff['ID'] == request['EmployeeID']).argmax() # index of employee - cpm_request = nurse_view[i, request['Day']] != SHIFTS.index(request['ShiftID']) - objective += request['Weight'] * ~cpm_request - - # Cover constraints, encode in objective with slack variables - for _, cover_request in cover.iterrows(): - nb_nurses = cp.Count(nurse_view[:, cover_request['Day']], SHIFTS.index(cover_request['ShiftID'])) - slack_over, slack_under = cp.intvar(0, len(staff), shape=2) - model += nb_nurses - slack_over + slack_under == cover_request["Requirement"] - - objective += cover_request["Weight for over"] * slack_over + cover_request["Weight for under"] * slack_under - - model.minimize(objective) - - return model, nurse_view +try: + import pandas as pd + 
pd.set_option('display.max_columns', 500) + pd.set_option('display.width', 5000) + HAS_PANDAS = True +except ImportError: + HAS_PANDAS = False if __name__ == "__main__": - - dataset = NurseRosteringDataset(root=".", download=True, transform=parse_scheduling_period) + # Example 1: Basic usage with native data structures + dataset = NurseRosteringDataset(root=".", download=True, transform=parse_scheduling_period, sort_key=sort_key) print("Dataset size:", len(dataset)) data, metadata = dataset[0] - for key, value in data.items(): - print(key,":") - print(value) + print(f"Instance: {metadata['name']}") + print(f"Horizon: {data['horizon']} days") + print(f"Number of nurses: {len(data['staff'])}") + print(f"Number of shifts: {len(data['shifts'])}") + # Solve the model model, nurse_view = nurserostering_model(**data) assert model.solve() - print(f"Found optimal solution with penalty of {model.objective_value()}") - assert model.objective_value() == 607 # optimal solution for the first instance + print(f"\nFound optimal solution with penalty of {model.objective_value()}") + assert model.objective_value() == 607 # optimal solution for the first instance - # pretty print solution - names = ["-"] + data['shifts'].index.tolist() + # Pretty print solution (native Python, no pandas required) + horizon = data['horizon'] + shift_ids = list(data['shifts'].keys()) + names = ["-"] + shift_ids sol = nurse_view.value() - df = pd.DataFrame(sol, index=data['staff'].name).map(names.__getitem__) - - for shift, _ in data['shifts'].iterrows(): - df.loc[f'Cover {shift}'] = "" - - for _, cover_request in data['cover'].iterrows(): + + # Create table: rows are nurses + cover rows, columns are days + table = [] + row_labels = [] + + # Add nurse rows + for i, nurse in enumerate(data['staff']): + nurse_name = nurse.get('name', nurse.get('ID', f'Nurse_{i}')) + row_labels.append(nurse_name) + table.append([names[sol[i][d]] for d in range(horizon)]) + + # Add cover rows (initialize with empty strings) 
+ for shift_id in shift_ids: + row_labels.append(f'Cover {shift_id}') + table.append([''] * horizon) + + # Fill in cover information + for cover_request in data['cover']: shift = cover_request['ShiftID'] - num_shifts = sum(df[cover_request['Day']] == shift) - df.loc[f"Cover {shift}",cover_request['Day']] = f"{num_shifts}/{cover_request['Requirement']}" - + day = cover_request['Day'] + requirement = cover_request['Requirement'] + # Count how many nurses are assigned to this shift on this day + num_shifts = sum(1 for i in range(len(data['staff'])) + if sol[i][day] == shift_ids.index(shift) + 1) # +1 because 0 is FREE + cover_row_idx = len(data['staff']) + shift_ids.index(shift) + table[cover_row_idx][day] = f"{num_shifts}/{requirement}" + + # Print table days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] - df.columns = [days[(int(col)) % 7] for col in df.columns] - - print(df.to_markdown()) + day_labels = [days[d % 7] for d in range(horizon)] + + # Calculate column widths + col_widths = [max(len(str(row[i])) for row in table + [day_labels]) for i in range(horizon)] + row_label_width = max(len(label) for label in row_labels) + + # Print header + print(f"\n{'Schedule:':<{row_label_width}}", end="") + for d, day_label in enumerate(day_labels): + print(f" {day_label:>{col_widths[d]}}", end="") + print() + + # Print separator + print("-" * (row_label_width + 1 + sum(w + 1 for w in col_widths))) + + # Print rows + for label, row in zip(row_labels, table): + print(f"{label:<{row_label_width}}", end="") + for d, val in enumerate(row): + print(f" {str(val):>{col_widths[d]}}", end="") + print() + + # Example 2: Using pandas DataFrames (optional) + if HAS_PANDAS: + print("\n" + "="*60) + print("Example with pandas DataFrames:") + print("="*60) + + def parse_with_dataframes(fname): + return to_dataframes(parse_scheduling_period(fname)) + + dataset_df = NurseRosteringDataset(root=".", download=False, transform=parse_with_dataframes, sort_key=sort_key) + data_df, _ = 
dataset_df[0] + + print("\nStaff DataFrame:") + print(data_df['staff'].head()) + + print("\nShifts DataFrame:") + print(data_df['shifts']) + + print("\nCover DataFrame:") + print(data_df['cover'].head()) From 8b76fd3d3113731ebae37c4b922ba9e934a1f492 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 6 Jan 2026 14:17:18 +0100 Subject: [PATCH 048/152] Remove left-over print statement --- cpmpy/tools/dataset/problem/jsplib.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cpmpy/tools/dataset/problem/jsplib.py b/cpmpy/tools/dataset/problem/jsplib.py index 17453fe32..ea3e88341 100644 --- a/cpmpy/tools/dataset/problem/jsplib.py +++ b/cpmpy/tools/dataset/problem/jsplib.py @@ -46,7 +46,6 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl # Create root directory if it doesn't exist self.root.mkdir(parents=True, exist_ok=True) - print(self.instance_dir, self.instance_dir.exists(), self.instance_dir.is_dir()) if not self.instance_dir.exists(): if not download: raise ValueError(f"Dataset not found in local file system. Please set download=True to download the dataset.") From e59fa996df3333c586871986f2cfb5d9cc270227 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 6 Jan 2026 14:46:07 +0100 Subject: [PATCH 049/152] small docstring --- cpmpy/tools/dataset/problem/nurserostering.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpmpy/tools/dataset/problem/nurserostering.py b/cpmpy/tools/dataset/problem/nurserostering.py index f9f3c0c61..589a373de 100644 --- a/cpmpy/tools/dataset/problem/nurserostering.py +++ b/cpmpy/tools/dataset/problem/nurserostering.py @@ -206,6 +206,9 @@ def parse_scheduling_period(filename: str): """ Parse a nurserostering instance file. + Args: + filename: Path to the nurserostering instance file. + Returns a dictionary with native Python data structures (lists of dicts). Use to_dataframes() transform to convert to pandas DataFrames if needed. Use add_fake_names() transform to add randomly generated names to staff. 
_std_open = open
def read_nurserostering(instance: Union[str, os.PathLike], open=open) -> cp.Model:
    """
    Parser for Nurse Rostering format. Reads in an instance and returns its matching CPMpy model.

    Arguments:
        instance (str or os.PathLike):
            - A file path to a Nurse Rostering file
            - OR a string containing the Nurse Rostering content directly
        open (callable):
            If instance is the path to a file, a callable to "open" that file
            (default=python standard library's 'open'). Pass e.g. ``lzma.open``
            or ``gzip.open`` to read compressed instances; the content is then
            decompressed into a temporary file before parsing.

    Returns:
        cp.Model: The CPMpy model of the Nurse Rostering instance.
    """
    fname = None          # path handed to the underlying parser
    created_tmp = False   # whether `fname` is a temp file we must remove

    if isinstance(instance, (str, os.PathLike)) and os.path.exists(instance):
        if open is _std_open:
            # Plain file on disk -> parse it directly
            fname = instance
        else:
            # Custom opener (e.g. lzma.open): the underlying parser always uses
            # the builtin open, so read the content through the custom opener
            # and spill it to a temporary file below.
            # (In the original code the `open` argument was accepted but never used.)
            with open(instance, "rt") as f_in:
                content = f_in.read()
    else:
        # Raw instance content passed directly as a string
        content = instance

    if fname is None:
        # The parser expects a file path -> write the content to a temp file
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as tmp:
            tmp.write(content)
            fname = tmp.name
        created_tmp = True

    try:
        # Use the existing parser from the dataset (expects a file path)
        data = parse_scheduling_period(fname)

        # Create the CPMpy model using the existing model builder
        model, _ = nurserostering_model(**data)

        return model
    finally:
        # Only remove files we created ourselves. The original heuristic
        # re-tested `isinstance(instance, str) and not os.path.exists(instance)`,
        # which misses os.PathLike inputs and races with filesystem changes.
        if created_tmp and os.path.exists(fname):
            os.unlink(fname)


def main():
    """Command-line entry point: read a Nurse Rostering instance, solve it, print the result."""
    parser = argparse.ArgumentParser(description="Parse and solve a Nurse Rostering model using CPMpy")
    parser.add_argument("model", help="Path to a Nurse Rostering file (or raw content string if --string is given)")
    parser.add_argument("-s", "--solver", default=None, help="Solver name to use (default: CPMpy's default)")
    parser.add_argument("--string", action="store_true", help="Interpret the first argument (model) as a raw Nurse Rostering string instead of a file path")
    parser.add_argument("-t", "--time-limit", type=int, default=None, help="Time limit for the solver in seconds (default: no limit)")
    args = parser.parse_args()

    # Build the CPMpy model
    try:
        if args.string:
            # Raw content: pass through unchanged
            model = read_nurserostering(args.model)
        else:
            # File path: expand '~' before handing it to the reader
            model = read_nurserostering(os.path.expanduser(args.model))
    except Exception as e:
        sys.stderr.write(f"Error reading model: {e}\n")
        sys.exit(1)

    # Solve the model
    try:
        if args.solver:
            result = model.solve(solver=args.solver, time_limit=args.time_limit)
        else:
            result = model.solve(time_limit=args.time_limit)
    except Exception as e:
        sys.stderr.write(f"Error solving model: {e}\n")
        sys.exit(1)

    # Print results
    print("Status:", model.status())
    # NOTE(review): Model.solve() typically returns a bool in CPMpy, so the
    # `is not None` test may always be True — confirm the intended check.
    if result is not None:
        if model.has_objective():
            print("Objective:", model.objective_value())
    else:
        print("No solution found.")
def solution_nurserostering(model):
    """
    Convert a CPMpy model solution into the solution string format.

    Arguments:
        model (cp.solvers.SolverInterface): The solver-specific model for which to print its solution

    Returns:
        str: formatted solution string.
    """
    # Skip auxiliary variables introduced by transformations (IV/BV/B# name prefixes)
    variables = {var.name: var.value() for var in model.user_vars if var.name[:2] not in ["IV", "BV", "B#"]}
    return str(variables)


class NurseRosteringBenchmark(Benchmark):

    """
    Nurse Rostering as a CPMpy benchmark.

    Emits results in a DIMACS-like line protocol: 'c' comments, 's' status,
    'v' solution values, 'o' objective values; `parse_output_line` reads that
    protocol back into a result dict.
    """

    def __init__(self):
        # Timestamp of the most recent intermediate solution,
        # parsed from 'c Solution ..., time = Xs' comment lines.
        self.sol_time = None
        super().__init__(reader=read_nurserostering)

    def print_comment(self, comment:str):
        """Print a comment line ('c ...'), normalizing the trailing newline."""
        print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True)

    def print_status(self, status: ExitStatus) -> None:
        """Print the final status line ('s ...')."""
        print('s' + chr(32) + status.value, end="\n", flush=True)

    def print_value(self, value: str) -> None:
        """Print a solution value line ('v ...')."""
        print('v' + chr(32) + value, end="\n", flush=True)

    def print_objective(self, objective: int) -> None:
        """Print an objective line ('o ...')."""
        print('o' + chr(32) + str(objective), end="\n", flush=True)

    def print_intermediate(self, objective:int):
        """Intermediate solutions are reported as ordinary objective lines."""
        self.print_objective(objective)

    def print_result(self, s):
        """Print objective, values and status for a solved solver object `s`."""
        if s.status().exitstatus == CPMStatus.OPTIMAL:
            self.print_objective(s.objective_value())
            self.print_value(solution_nurserostering(s))
            self.print_status(ExitStatus.optimal)
        elif s.status().exitstatus == CPMStatus.FEASIBLE:
            self.print_objective(s.objective_value())
            self.print_value(solution_nurserostering(s))
            self.print_status(ExitStatus.sat)
        elif s.status().exitstatus == CPMStatus.UNSATISFIABLE:
            self.print_status(ExitStatus.unsat)
        else:
            self.print_comment("Solver did not find any solution within the time/memory limit")
            self.print_status(ExitStatus.unknown)

    def handle_memory_error(self, mem_limit):
        """On memory exhaustion: delegate to base handling, then report unknown."""
        super().handle_memory_error(mem_limit)
        self.print_status(ExitStatus.unknown)

    def handle_not_implemented(self, e):
        """On NotImplementedError: delegate to base handling, then report unsupported."""
        super().handle_not_implemented(e)
        self.print_status(ExitStatus.unsupported)

    def handle_exception(self, e):
        """On any other exception: delegate to base handling, then report unknown."""
        super().handle_exception(e)
        self.print_status(ExitStatus.unknown)


    def handle_sigterm(self):
        """
        Handles a SIGTERM. Gives us 1 second to finish the current job before we get killed.
        """
        # Report that we haven't found a solution in time
        self.print_status(ExitStatus.unknown)
        self.print_comment("SIGTERM raised.")
        return 0

    def handle_rlimit_cpu(self):
        """
        Handles a SIGXCPU.
        """
        # Report that we haven't found a solution in time
        self.print_status(ExitStatus.unknown)
        self.print_comment("SIGXCPU raised.")
        return 0

    def parse_output_line(self, line, result):
        """
        Parse one line of benchmark output into the `result` dict.

        Recognizes status ('s '), values ('v '), objectives ('o '),
        intermediate-solution comments ('c Solution ...') and timing
        comments ('c took ...').
        """
        if line.startswith('s '):
            result['status'] = line[2:].strip()
        elif line.startswith('v '):
            # Accumulate successive 'v' lines into one space-joined string.
            solution = line.split("\n")[0][2:].strip()
            # BUGFIX: the original tested `solution not in result` (membership of
            # the solution *string* among the dict keys), which is practically
            # always true, so every 'v' line overwrote the previous one and the
            # concatenation branch was dead code.
            if 'solution' not in result:
                result['solution'] = solution
            else:
                result['solution'] = result['solution'] + ' ' + str(solution)
        elif line.startswith('c Solution'):
            parts = line.split(', time = ')
            # Get solution time from comment for intermediate solution -> used for annotating 'o ...' lines
            self.sol_time = float(parts[-1].replace('s', '').rstrip())
        elif line.startswith('o '):
            obj = int(line[2:].strip())
            if result['intermediate'] is None:
                result['intermediate'] = []
            if self.sol_time is not None:
                result['intermediate'] += [(self.sol_time, obj)]
            result['objective_value'] = obj
        elif line.startswith('c took '):
            # Parse timing information of the form 'c took X seconds to <action>'
            parts = line.split(' seconds to ')
            if len(parts) == 2:
                time_val = float(parts[0].replace('c took ', ''))
                action = parts[1].strip()
                if action.startswith('parse'):
                    result['time_parse'] = time_val
                elif action.startswith('convert'):
                    result['time_model'] = time_val
                elif action.startswith('post'):
                    result['time_post'] = time_val
                elif action.startswith('solve'):
                    result['time_solve'] = time_val
help='Memory limit in MB per instance') + parser.add_argument('--cores', type=int, default=1, help='Number of cores to assign to a single instance') + parser.add_argument('--output-dir', type=str, default='results', help='Output directory for CSV files') + parser.add_argument('--verbose', action='store_true', help='Show solver output') + parser.add_argument('--intermediate', action='store_true', help='Report on intermediate solutions') + args = parser.parse_args() + + if not args.verbose: + warnings.filterwarnings("ignore") + + # Load benchmark instances (as a dataset) + from cpmpy.tools.dataset.problem.nurserostering import NurseRosteringDataset + dataset = NurseRosteringDataset(root=".", download=True) + + # Create output directory + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + # Get current timestamp in a filename-safe format + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Define output file path with timestamp + output_file = str(output_dir / "nurserostering" / f"nurserostering_{args.solver}_{timestamp}.csv") + + # Run the benchmark + instance_runner = NurseRosteringBenchmark() + output_file = benchmark_runner(dataset=dataset, instance_runner=instance_runner, output_file=output_file, **vars(args)) + print(f"Results added to {output_file}") + From 30c9e480c056496475494438b76936efcad87c8f Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 6 Jan 2026 19:29:14 +0100 Subject: [PATCH 052/152] ensutre soft is lower --- cpmpy/tools/benchmark/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/benchmark/__init__.py b/cpmpy/tools/benchmark/__init__.py index ce383c1de..1946c0a5b 100644 --- a/cpmpy/tools/benchmark/__init__.py +++ b/cpmpy/tools/benchmark/__init__.py @@ -19,6 +19,7 @@ def set_memory_limit(mem_limit): if mem_limit is not None: soft = max(_mib_as_bytes(mem_limit) - _mib_as_bytes(MEMORY_BUFFER_SOFT), _mib_as_bytes(MEMORY_BUFFER_SOFT)) hard = 
max(_mib_as_bytes(mem_limit) - _mib_as_bytes(MEMORY_BUFFER_HARD), _mib_as_bytes(MEMORY_BUFFER_HARD)) + soft = min(soft, hard) if sys.platform != "win32": import resource resource.setrlimit(resource.RLIMIT_AS, (soft, hard)) # limit memory in number of bytes @@ -39,7 +40,7 @@ def set_time_limit(time_limit, verbose:bool=False): if time_limit is not None: if sys.platform != "win32": import resource - soft = time_limit + soft = int(time_limit) hard = resource.RLIM_INFINITY resource.setrlimit(resource.RLIMIT_CPU, (soft, hard)) else: From 5eb8b5728d50cdcc796f24993379f3da1d8a49ca Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 6 Jan 2026 19:29:58 +0100 Subject: [PATCH 053/152] make sure path exists --- cpmpy/tools/dataset/model/xcsp3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cpmpy/tools/dataset/model/xcsp3.py b/cpmpy/tools/dataset/model/xcsp3.py index f17a4d193..3a5f51d30 100644 --- a/cpmpy/tools/dataset/model/xcsp3.py +++ b/cpmpy/tools/dataset/model/xcsp3.py @@ -84,6 +84,7 @@ def download(self): year_suffix = str(self.year)[2:] # Drop the starting '20' url_path = url + f"instancesXCSP{year_suffix}.zip" zip_path = self.root / f"instancesXCSP{year_suffix}.zip" + pathlib.Path(self.root).mkdir(parents=True, exist_ok=True) try: urlretrieve(url_path, str(zip_path)) From cef1ab3a3a0da005af146a247b5e4652921fe2e6 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 6 Jan 2026 19:30:30 +0100 Subject: [PATCH 054/152] prototype runner --- cpmpy/tools/benchmark/test/README.md | 211 ++++ cpmpy/tools/benchmark/test/bench_xcsp3.py | 84 ++ cpmpy/tools/benchmark/test/instance_runner.py | 42 + cpmpy/tools/benchmark/test/main.py | 46 + cpmpy/tools/benchmark/test/manager.py | 438 ++++++++ cpmpy/tools/benchmark/test/run_xcsp3.py | 58 ++ cpmpy/tools/benchmark/test/runner.py | 933 ++++++++++++++++++ .../benchmark/test/xcsp3_instance_runner.py | 66 ++ 8 files changed, 1878 insertions(+) create mode 100644 cpmpy/tools/benchmark/test/README.md create mode 100644 
cpmpy/tools/benchmark/test/bench_xcsp3.py create mode 100644 cpmpy/tools/benchmark/test/instance_runner.py create mode 100644 cpmpy/tools/benchmark/test/main.py create mode 100644 cpmpy/tools/benchmark/test/manager.py create mode 100644 cpmpy/tools/benchmark/test/run_xcsp3.py create mode 100644 cpmpy/tools/benchmark/test/runner.py create mode 100644 cpmpy/tools/benchmark/test/xcsp3_instance_runner.py diff --git a/cpmpy/tools/benchmark/test/README.md b/cpmpy/tools/benchmark/test/README.md new file mode 100644 index 000000000..a415ece6c --- /dev/null +++ b/cpmpy/tools/benchmark/test/README.md @@ -0,0 +1,211 @@ +# Benchmark Testing Tooling + +python cpmpy/tools/benchmark/test/xcsp3_instance_runner.py data/2024/CSP/AverageAvoiding-20_c24.xml.lzma + + + + + +This directory contains tooling for benchmarking and testing constraint satisfaction problem instances, particularly XCSP3 instances. + +## Overview + +The tooling provides a flexible framework for: +- Running individual problem instances with various solvers +- Managing computational resources (time, memory, CPU cores) +- Collecting detailed profiling and solution information +- Running benchmarks in parallel across multiple instances + +## Components + +### Core Components + +- **`instance_runner.py`**: Base class for instance runners. Provides the interface for running instances with argument parsing and observer registration. + +- **`xcsp3_instance_runner.py`**: Specialized runner for XCSP3 instances. Handles reading compressed (.lzma) and uncompressed XCSP3 files, and sets up appropriate observers for competition-style output. 
+ +- **`runner.py`**: Core execution engine that: + - Reads problem instances + - Transforms them into solver models + - Executes solvers with resource limits + - Manages observers for profiling, solution checking, and output formatting + +- **`manager.py`**: Resource management systems: + - `RunExecResourceManager`: Uses benchexec's RunExecutor for strict resource control (requires benchexec) + - `PythonResourceManager`: Python-based resource management using observers + +### Example Scripts + +- **`main.py`**: Example of running parallel benchmarks on XCSP3 datasets with resource management +- **`bench_xcsp3.py`**: Alternative benchmarking script (deprecated, see `run_xcsp3.py`) +- **`run_xcsp3.py`**: Deprecated script (use `XCSP3InstanceRunner` instead) + +## Usage + +### Running a Single Instance + +The simplest way to run a single XCSP3 instance: + +```bash +python -m cpmpy.tools.benchmark.test.xcsp3_instance_runner [options] +``` + +**Options:** +- `--solver SOLVER`: Solver to use (default: "ortools") +- `--output_file FILE`: Output file path (default: `results/{solver}_{instance}.txt`) +- `--time_limit SECONDS`: Time limit in seconds +- `--mem_limit MB`: Memory limit in MB +- `--seed SEED`: Random seed for solver +- `--intermediate`: Print intermediate solutions +- `--cores N`: Number of CPU cores to use +- `--verbose`: Enable verbose output + +**Example:** +```bash +python -m cpmpy.tools.benchmark.test.xcsp3_instance_runner instance.xml --solver ortools --time_limit 300 --seed 42 +``` + +### Running Multiple Instances in Parallel + +Use `main.py` as a template for running benchmarks on multiple instances: + +```python +from cpmpy.tools.benchmark.test.xcsp3_instance_runner import XCSP3InstanceRunner +from cpmpy.tools.benchmark.test.manager import RunExecResourceManager, run_instance +from cpmpy.tools.dataset.model.xcsp3 import XCSP3Dataset +from concurrent.futures import ProcessPoolExecutor +from queue import Queue + +# Load dataset +dataset = 
XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) + +# Configure resources +time_limit = 600 # 10 minutes +workers = 4 +cores_per_worker = 1 +memory_limit = 8000 # MB per worker + +# Initialize managers +resource_manager = RunExecResourceManager() +instance_runner = XCSP3InstanceRunner() + +# Create job queue +job_queue = Queue() +for instance, metadata in dataset: + job_queue.put((instance, metadata)) + +# Run with parallel workers +with ProcessPoolExecutor(max_workers=workers) as executor: + # ... worker setup code ... +``` + +### Using Resource Managers + +#### RunExecResourceManager (Recommended) + +Uses benchexec's RunExecutor for strict resource control. Requires `benchexec` to be installed. + +```python +from cpmpy.tools.benchmark.test.manager import RunExecResourceManager, XCSP3InstanceRunner + +resource_manager = RunExecResourceManager() +runner = XCSP3InstanceRunner() + +resource_manager.run( + instance="instance.xml", + runner=runner, + time_limit=300, + memory_limit=4000, + cores=[0, 1] # Use cores 0 and 1 +) +``` + +#### PythonResourceManager + +Python-based resource management using observers. Less strict but doesn't require external dependencies. 
+ +```python +from cpmpy.tools.benchmark.test.manager import PythonResourceManager, XCSP3InstanceRunner + +resource_manager = PythonResourceManager() +runner = XCSP3InstanceRunner() + +resource_manager.run( + instance="instance.xml", + runner=runner, + time_limit=300, + memory_limit=4000, + cores=[0, 1] +) +``` + +### Using the Manager CLI + +The `manager.py` script provides a command-line interface: + +```bash +python -m cpmpy.tools.benchmark.test.manager \ + --instance instance.xml \ + --time_limit 300 \ + --memory_limit 4000 \ + --cores 0,1 \ + --runner xcsp3 \ + --resource_manager runexec +``` + +**Options:** +- `--instance PATH`: Path to instance file (required) +- `--time_limit SECONDS`: Time limit in seconds +- `--memory_limit MB`: Memory limit in MB +- `--cores LIST`: Comma-separated list of core IDs (e.g., "0,1,2") +- `--runner RUNNER`: Runner to use (default: "xcsp3") +- `--resource_manager MANAGER`: Resource manager ("runexec" or "python", default: "runexec") + +## Observers + +The runner system uses observers to collect information and format output: + +- **`CompetitionPrintingObserver`**: Prints competition-style output (s, v, c lines) +- **`ProfilingObserver`**: Collects timing and resource usage statistics +- **`HandlerObserver`**: Handles exceptions and errors +- **`SolverArgsObserver`**: Logs solver arguments +- **`SolutionCheckerObserver`**: Validates solutions +- **`ResourceLimitObserver`**: Monitors and enforces resource limits + +Observers are automatically registered by `XCSP3InstanceRunner`. 
To add custom observers: + +```python +from cpmpy.tools.benchmark.test.runner import YourCustomObserver + +runner = XCSP3InstanceRunner() +runner.register_observer(YourCustomObserver()) +runner.run(instance="instance.xml") +``` + +## Output Format + +The tooling produces competition-style output: + +- `c `: Comment lines +- `s `: Solution status (SATISFIABLE, UNSATISFIABLE, UNKNOWN) +- `v `: Variable assignments (if solution found) +- `o `: Objective value (for optimization problems) + +Output is written to the specified output file (default: `results/{solver}_{instance}.txt`). + +## Supported File Formats + +- **XCSP3**: XML-based constraint satisfaction problem format +- **Compressed**: Supports `.lzma` compressed XCSP3 files (automatically detected) + +## Dependencies + +- **Required**: cpmpy, standard Python libraries +- **Optional**: benchexec (for `RunExecResourceManager`) + +## Examples + +See `main.py` and `bench_xcsp3.py` for complete examples of running benchmarks. + + + diff --git a/cpmpy/tools/benchmark/test/bench_xcsp3.py b/cpmpy/tools/benchmark/test/bench_xcsp3.py new file mode 100644 index 000000000..e8a1caa30 --- /dev/null +++ b/cpmpy/tools/benchmark/test/bench_xcsp3.py @@ -0,0 +1,84 @@ +from concurrent.futures import ProcessPoolExecutor +from multiprocessing import Manager + +from cpmpy.tools.benchmark.test.manager import RunExecResourceManager, run_instance +from cpmpy.tools.benchmark.test.xcsp3_instance_runner import XCSP3InstanceRunner + + +def worker_function(worker_id, cores, job_queue, time_limit, memory_limit): + """Worker function that picks jobs from the queue until it's empty.""" + # Recreate instances in each worker process (they may not be picklable) + resource_manager = RunExecResourceManager() + instance_runner = XCSP3InstanceRunner() + + while True: + try: + # Get a job from the queue (blocks until one is available) + instance, metadata = job_queue.get_nowait() + except Exception: + # Queue is empty, worker is done + break + + # Run 
the instance with this worker's assigned cores + run_instance(instance, instance_runner, time_limit, memory_limit, cores, resource_manager) + job_queue.task_done() + + +def main(): + from cpmpy.tools.dataset.model.xcsp3 import XCSP3Dataset + + # dataset = XCSP3Dataset(root="./data", year=2025, track="CSP25", download=True) + # dataset = OPBDataset(root="./data", year=2024, track="DEC-LIN", download=True) + # dataset = JSPLibDataset(root="./data", download=True) + dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) + + time_limit = 10*60 + workers = 1 + cores_per_worker = 1 + total_memory = 25000 + memory_per_worker = total_memory // workers + memory_limit = memory_per_worker# Bytes to MB + # resource_manager = RunExecResourceManager() + # instance_runner = XCSP3InstanceRunner() + + # Calculate core assignments for each worker + # Each worker gets a fixed set of consecutive cores + import psutil + total_cores = psutil.cpu_count(logical=False) # physical cores + # total_cores = psutil.cpu_count(logical=True) # logical cores (with hyperthreading) + + if workers * cores_per_worker > total_cores: + raise ValueError(f"Not enough cores: {workers} workers × {cores_per_worker} cores = {workers * cores_per_worker} cores needed, but only {total_cores} available") + + # Assign cores to each worker + worker_cores = [] + for i in range(workers): + start_core = i * cores_per_worker + end_core = start_core + cores_per_worker + cores = list(range(start_core, end_core)) + worker_cores.append(cores) + + print(f"Total cores: {total_cores}, Workers: {workers}, Cores per worker: {cores_per_worker}") + for i, cores in enumerate(worker_cores): + print(f"Worker {i}: cores {cores}") + + # Create a queue of all jobs using Manager for ProcessPoolExecutor compatibility + with Manager() as manager: + job_queue = manager.Queue() + for instance, metadata in dataset: + job_queue.put((instance, metadata)) + + # Submit workers to the executor + with 
ProcessPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit(worker_function, worker_id, cores, job_queue, time_limit, memory_limit) + for worker_id, cores in enumerate(worker_cores) + ] + # Wait for all workers to finish + for future in futures: + future.result() + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cpmpy/tools/benchmark/test/instance_runner.py b/cpmpy/tools/benchmark/test/instance_runner.py new file mode 100644 index 000000000..3b1122851 --- /dev/null +++ b/cpmpy/tools/benchmark/test/instance_runner.py @@ -0,0 +1,42 @@ +import argparse +import lzma +from pathlib import Path +from functools import partial + +from cpmpy.tools.benchmark.test.runner import Runner, CompetitionPrintingObserver, ProfilingObserver, HandlerObserver, SolverArgsObserver, SolutionCheckerObserver, WriteToFileObserver +from cpmpy.tools.dataset.model.xcsp3 import XCSP3Dataset +from cpmpy.tools.xcsp3 import read_xcsp3 + +class InstanceRunner: + + def __init__(self): + self.additional_observers = [] + + def cmd(self, instance: str): + pass + + def argparser(self): + parser = argparse.ArgumentParser() + parser.add_argument("instance", type=str) + parser.add_argument("--solver", type=str, default="ortools") + parser.add_argument("--output_file", type=str, default=None) + parser.add_argument("--verbose", action="store_true", default=False) + parser.add_argument("--time_limit", type=float, default=None) + parser.add_argument("--mem_limit", type=int, default=None) + parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--intermediate", action="store_true", default=False) + parser.add_argument("--cores", type=int, default=None) + parser.add_argument("--observers", type=list[str], default=None) + return parser + + def print_comment(self, comment: str): + pass + + def register_observer(self, observer): + """Register an observer to be added when run() is called.""" + self.additional_observers.append(observer) + 
+ def get_additional_observers(self): + """Get the list of additional observers that should be registered.""" + return self.additional_observers + diff --git a/cpmpy/tools/benchmark/test/main.py b/cpmpy/tools/benchmark/test/main.py new file mode 100644 index 000000000..bb1f7a527 --- /dev/null +++ b/cpmpy/tools/benchmark/test/main.py @@ -0,0 +1,46 @@ +import subprocess +import os +import sys +from concurrent.futures import ThreadPoolExecutor + +def main(): + from cpmpy.tools.dataset.problem.psplib import PSPLibDataset + from cpmpy.tools.rcpsp import read_rcpsp + + # dataset = XCSP3Dataset(root="./data", year=2025, track="CSP25", download=True) + # dataset = OPBDataset(root="./data", year=2024, track="DEC-LIN", download=True) + # dataset = JSPLibDataset(root="./data", download=True) + dataset = PSPLibDataset(root="./data", download=True) + + time_limit = 10 + workers = 10 + + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = [executor.submit(run_instance, instance, metadata, time_limit) for instance, metadata in dataset] + for future in futures: + future.result() + +def run_instance(instance, metadata, time_limit): + this_file_path = os.path.dirname(os.path.abspath(__file__)) + this_python = sys.executable + cmd_runexec = [ + "runexec", + "--walltimelimit", f"{time_limit}s", + "--no-container", + "--" + ] + cmd = cmd_runexec + [ + this_python, os.path.join(this_file_path, "runner.py"), + instance, + "--solver", "ortools", + "--time_limit", str(time_limit), + "--seed", "1234567890", + "--intermediate", + "--cores", "1" + ] + print(" ".join(cmd)) + subprocess.run(cmd) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cpmpy/tools/benchmark/test/manager.py b/cpmpy/tools/benchmark/test/manager.py new file mode 100644 index 000000000..1ded83305 --- /dev/null +++ b/cpmpy/tools/benchmark/test/manager.py @@ -0,0 +1,438 @@ +import os +import sys +import argparse +import signal +import importlib +import importlib.util +import 
contextlib +import warnings +import logging +from pathlib import Path + +from cpmpy.tools.benchmark.test.instance_runner import InstanceRunner +from cpmpy.tools.benchmark.test.xcsp3_instance_runner import XCSP3InstanceRunner +from cpmpy.tools.benchmark.test.runner import ResourceLimitObserver + + +class ResourceManager: + pass + +class RunExecResourceManager: + + @contextlib.contextmanager + def _print_forwarding_context(self, runner: InstanceRunner): + """Context manager that forwards all print statements, warnings, and logging to runner.print_comment.""" + class PrintForwarder: + def __init__(self, runner, is_stderr=False): + self.runner = runner + self.original_stream = sys.stderr if is_stderr else sys.stdout + self.is_stderr = is_stderr + self.buffer = [] + # Track if we're in a logging handler to avoid duplicates + self._in_logging_handler = False + + def _is_from_benchexec(self): + """Check if the current call stack includes benchexec code.""" + import inspect + frame = None + try: + # Skip the current frame (write) and the caller frame + frame = inspect.currentframe() + if frame and frame.f_back: + frame = frame.f_back.f_back # Skip write and its immediate caller + while frame: + module_name = frame.f_globals.get('__name__', '') + if 'benchexec' in module_name: + return True + frame = frame.f_back + return False + except Exception: + # If inspection fails, err on the side of forwarding + return False + finally: + # Explicitly delete frame reference to avoid reference cycles + if frame is not None: + del frame + + def write(self, text): + # Skip forwarding if output is coming from benchexec/RunExecutor + if self._is_from_benchexec(): + # Just write to original stream, don't forward + self.original_stream.write(text) + return + + # Skip forwarding if this is stderr and looks like a logging message + # (logging handler will forward it instead) + if self.is_stderr and text.strip(): + # Check if this looks like a logging message (starts with log level) + 
first_line = text.split('\n')[0].strip() + if first_line.startswith(('WARNING:', 'ERROR:', 'CRITICAL:', 'INFO:', 'DEBUG:')): + # This is a logging message, don't forward (logging handler will handle it) + self.original_stream.write(text) + return + + # Forward immediately line by line for real-time forwarding + if text: + # Split by newlines and forward each complete line + lines = text.split('\n') + # If text doesn't end with newline, the last part is incomplete + if text.endswith('\n'): + # All lines are complete + for line in lines[:-1]: # Last element is empty string + if line.strip(): + self.runner.print_comment(line.rstrip()) + else: + # Forward complete lines, buffer incomplete line + for line in lines[:-1]: + if line.strip(): + self.runner.print_comment(line.rstrip()) + # Buffer the incomplete line + self.buffer.append(lines[-1]) + # Also write to original stream to preserve normal behavior + self.original_stream.write(text) + + def flush(self): + self.original_stream.flush() + + def forward_to_runner(self): + # Forward any remaining buffered output + if self.buffer: + full_text = ''.join(self.buffer) + if full_text.strip(): + self.runner.print_comment(full_text.rstrip()) + self.buffer = [] + + class LoggingHandler(logging.Handler): + """Custom logging handler that forwards log messages to runner.""" + def __init__(self, runner): + super().__init__() + self.runner = runner + # Use a simple format similar to default logging format + self.setFormatter(logging.Formatter('%(levelname)s:%(name)s:%(message)s')) + # Prevent propagation to avoid duplicate messages in stderr + self.propagate = False + + def emit(self, record): + try: + # Format the log message + log_msg = self.format(record) + # Forward to runner + self.runner.print_comment(log_msg) + except Exception: + # Ignore errors in logging handler to avoid recursion + pass + + def warning_handler(message, category, filename, lineno, file=None, line=None): + """Custom warning handler that forwards warnings to 
runner.""" + # Format the warning message + warning_msg = f"{category.__name__}: {str(message).rstrip()}" + # Forward to runner + runner.print_comment(warning_msg) + # Also call the original warning handler to preserve normal behavior + original_showwarning(message, category, filename, lineno, file, line) + + stdout_forwarder = PrintForwarder(runner, is_stderr=False) + stderr_forwarder = PrintForwarder(runner, is_stderr=True) + logging_handler = LoggingHandler(runner) + logging_handler.setLevel(logging.WARNING) # Only capture WARNING and above + + # Get root logger and benchexec logger + root_logger = logging.getLogger() + benchexec_logger = logging.getLogger('benchexec') + original_root_handlers = root_logger.handlers[:] + original_root_level = root_logger.level + original_root_propagate = root_logger.propagate + original_benchexec_handlers = benchexec_logger.handlers[:] + original_benchexec_level = benchexec_logger.level + original_benchexec_propagate = benchexec_logger.propagate + + # Find and temporarily remove stderr handlers to prevent duplicates + # (logging handlers write to stderr, which our stderr forwarder would also capture) + # Store original stderr reference before redirecting + original_stderr = sys.stderr + stderr_handlers_to_remove = [] + for handler in root_logger.handlers: + if isinstance(handler, logging.StreamHandler) and handler.stream == original_stderr: + stderr_handlers_to_remove.append(handler) + for handler in stderr_handlers_to_remove: + root_logger.removeHandler(handler) + + benchexec_stderr_handlers_to_remove = [] + for handler in benchexec_logger.handlers: + if isinstance(handler, logging.StreamHandler) and handler.stream == original_stderr: + benchexec_stderr_handlers_to_remove.append(handler) + for handler in benchexec_stderr_handlers_to_remove: + benchexec_logger.removeHandler(handler) + + original_showwarning = warnings.showwarning + + try: + # Redirect stdout and stderr + sys.stdout = stdout_forwarder + sys.stderr = 
stderr_forwarder + # Redirect warnings + warnings.showwarning = warning_handler + # Temporarily disable propagation to prevent duplicate messages + root_logger.propagate = False + benchexec_logger.propagate = False + # Disable lastResort handler (Python 3.2+) to prevent fallback to stderr + if hasattr(logging, 'lastResort'): + original_last_resort = logging.lastResort + logging.lastResort = None + else: + original_last_resort = None + # Add logging handler to both root and benchexec loggers + root_logger.addHandler(logging_handler) + root_logger.setLevel(logging.WARNING) # Ensure we capture warnings + benchexec_logger.addHandler(logging_handler) + benchexec_logger.setLevel(logging.WARNING) + yield + finally: + # Restore lastResort handler if we disabled it + if original_last_resort is not None: + logging.lastResort = original_last_resort + # Restore stdout and stderr + sys.stdout = stdout_forwarder.original_stream + sys.stderr = stderr_forwarder.original_stream + # Restore warnings + warnings.showwarning = original_showwarning + # Remove our logging handler + root_logger.removeHandler(logging_handler) + benchexec_logger.removeHandler(logging_handler) + # Restore original handlers (including stderr handlers) + root_logger.handlers = original_root_handlers + root_logger.setLevel(original_root_level) + root_logger.propagate = original_root_propagate + benchexec_logger.handlers = original_benchexec_handlers + benchexec_logger.setLevel(original_benchexec_level) + benchexec_logger.propagate = original_benchexec_propagate + # Forward any remaining buffered output + stdout_forwarder.forward_to_runner() + stderr_forwarder.forward_to_runner() + + def run(self, instance: str, runner: InstanceRunner, time_limit: float, memory_limit: int, cores: list[int]): + + runner.print_comment(f"Running instance {instance} with time limit {time_limit} and memory limit {memory_limit} and cores {cores}") + runner.print_comment(f"Running with manager {self.__class__.__name__}") + + from 
benchexec.runexecutor import RunExecutor + + # Use a temporary file to capture subprocess output, then forward it + import tempfile + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.log') as tmp_file: + tmp_filename = tmp_file.name + + try: + # Capture warnings from benchexec itself (current process) and subprocess output + # Set up forwarding context BEFORE creating executor to catch all warnings + with self._print_forwarding_context(runner): + executor = RunExecutor( + use_namespaces=False, + ) + + def signal_handler_kill(signum, frame): + executor.stop() + + signal.signal(signal.SIGTERM, signal_handler_kill) + signal.signal(signal.SIGQUIT, signal_handler_kill) + signal.signal(signal.SIGINT, signal_handler_kill) + + cmd = runner.cmd(instance) + if time_limit is not None: + cmd.append("--time_limit") + cmd.append(str(time_limit)) + + cmd += [ + "--seed", "1234567890", + "--intermediate", + #"--cores", str(len(cores)) # Pass number of cores to the solver + ] + + result = executor.execute_run( + args=cmd, + output_filename=tmp_filename, # Capture subprocess output to temp file + # stdin=stdin, + # hardtimelimit=options.timelimit, + # softtimelimit=options.softtimelimit, + walltimelimit=time_limit, + cores=cores, + memlimit=memory_limit, + # memory_nodes=options.memoryNodes, + # cgroupValues=cgroup_values, + # workingDir=options.dir, + # maxLogfileSize=options.maxOutputSize, + # files_count_limit=options.filesCountLimit, + # files_size_limit=options.filesSizeLimit, + write_header=False, + # **container_output_options, + ) + + # Read the output file and forward subprocess output to runner + # Filter out RunExecutor-specific messages that get mixed into subprocess output + def _is_runexec_message(line): + """Check if a line is a RunExecutor-specific message that should be filtered.""" + line_lower = line.lower().strip() + # Filter specific RunExecutor warning patterns (very specific to avoid false positives) + runexec_patterns = [ + 'warning: no 
variables in this model (and so, no generated file)', + 'warning: no variables in this model', + ] + return any(pattern in line_lower for pattern in runexec_patterns) + + try: + with open(tmp_filename, 'r', encoding='utf-8', errors='replace') as f: + for line in f: + line_stripped = line.strip() + # Skip empty lines and RunExecutor messages + if line_stripped and not _is_runexec_message(line_stripped): + # Subprocess output is already formatted by the runner's observers, + # so print it directly without wrapping in print_comment to avoid double-prefixing + print(line_stripped, flush=True) + except FileNotFoundError: + # Output file might not exist if process was killed before writing + pass + finally: + # Clean up temp file + try: + os.unlink(tmp_filename) + except Exception: + pass + + runner.print_comment(f"RunExec result: {result}") + + if "terminationreason" in result: + reason = result["terminationreason"] + if reason == "memory": + runner.print_comment("Memory limit exceeded") + elif reason == "walltime": + runner.print_comment("Wall time limit exceeded") + +class PythonResourceManager: + + def run(self, instance: str, runner: InstanceRunner, time_limit: int, memory_limit: int, cores: list[int]): + # Programmatically add ResourceLimitObserver if limits are provided + if time_limit is not None or memory_limit is not None: + # Add a resource observer with limits + resource_observer = ResourceLimitObserver( + time_limit=time_limit if time_limit is not None else None, + mem_limit=memory_limit if memory_limit is not None else None + ) + runner.register_observer(resource_observer) + + # Run the instance using the runner's run method + runner.run(instance=instance, time_limit=time_limit, mem_limit=memory_limit, cores=len(cores) if cores else None) + + + + +def run_instance(instance: str, instance_runner: InstanceRunner, time_limit: int, memory_limit: int, cores: list[int], resource_manager: ResourceManager): + + + """ + Run a single instance with assigned cores. 
+
+    Args:
+        instance: Instance file path
+        instance_runner: InstanceRunner used to read and solve the instance
+        time_limit: Time limit in seconds
+        memory_limit: Memory limit in MB
+        cores: List of core IDs to assign to this run (e.g., [0, 1] for cores 0 and 1)
+        resource_manager: ResourceManager that enforces the limits ("runexec" or "python")
+    """
+
+
+    resource_manager.run(instance, instance_runner, time_limit, memory_limit, cores)
+
+
+    # Convert cores list to comma-separated string for runexec
+    #cores_str = ",".join(map(str, cores))
+
+    # cmd_runexec = [
+    #     "runexec",
+    #     "--walltimelimit", f"{time_limit}s",
+    #     "--memlimit", f"{memory_limit}MB",
+    #     "--no-container",
+    #     "--cores", cores_str,
+    #     "--"
+    # ]
+
+
+
+def load_instance_runner(runner_path: str) -> InstanceRunner:
+    """
+    Load an instance runner class from a module path.
+
+    Args:
+        runner_path: Path to the instance runner class, e.g.,
+            "cpmpy.tools.benchmark.test.xcsp3_instance_runner.XCSP3InstanceRunner"
+            or a file path like "/path/to/module.py:ClassName"
+
+    Returns:
+        InstanceRunner instance
+    """
+    if ":" in runner_path:
+        # Format: /path/to/module.py:ClassName
+        file_path, class_name = runner_path.rsplit(":", 1)
+        file_path = Path(file_path).resolve()
+
+        # Add parent directory to sys.path if needed
+        parent_dir = str(file_path.parent)
+        if parent_dir not in sys.path:
+            sys.path.insert(0, parent_dir)
+
+        # Import the module
+        module_name = file_path.stem
+        spec = importlib.util.spec_from_file_location(module_name, file_path)
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+
+        # Get the class
+        runner_class = getattr(module, class_name)
+    elif "." in runner_path:
+        # Format: module.path.ClassName
+        module_path, class_name = runner_path.rsplit(".", 1)
+        module = importlib.import_module(module_path)
+        runner_class = getattr(module, class_name)
+    else:
+        # Default to xcsp3 if just a name
+        if runner_path == "xcsp3":
+            return XCSP3InstanceRunner()
+        else:
+            raise ValueError(f"Invalid runner path format: {runner_path}. 
Use 'module.path.ClassName' or '/path/to/file.py:ClassName'") + + if not issubclass(runner_class, InstanceRunner): + raise ValueError(f"{runner_class} is not a subclass of InstanceRunner") + + return runner_class() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--instance", type=str, required=True) + parser.add_argument("--time_limit", type=float, required=False, default=None) + parser.add_argument("--memory_limit", type=int, required=False, default=None) + parser.add_argument("--cores", type=list[int], required=False, default=None) + parser.add_argument("--runner", type=str, required=False, default="xcsp3", + help="Path to instance runner class. Can be:\n" + "- 'xcsp3' (default)\n" + "- Module path: 'cpmpy.tools.benchmark.test.xcsp3_instance_runner.XCSP3InstanceRunner'\n" + "- File path: '/path/to/module.py:ClassName'") + parser.add_argument("--resource_manager", type=str, required=False, default="runexec") + args = parser.parse_args() + + if args.resource_manager == "runexec": + resource_manager = RunExecResourceManager() + elif args.resource_manager == "python": + resource_manager = PythonResourceManager() + else: + raise ValueError(f"Invalid resource manager: {args.resource_manager}") + + # Load the instance runner + if args.runner == "xcsp3": + instance_runner = XCSP3InstanceRunner() + else: + instance_runner = load_instance_runner(args.runner) + + resource_manager.run(args.instance, instance_runner, args.time_limit, args.memory_limit, args.cores) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cpmpy/tools/benchmark/test/run_xcsp3.py b/cpmpy/tools/benchmark/test/run_xcsp3.py new file mode 100644 index 000000000..c66e57d55 --- /dev/null +++ b/cpmpy/tools/benchmark/test/run_xcsp3.py @@ -0,0 +1,58 @@ +""" +Deprecated: Use XCSP3InstanceRunner instead +""" +import argparse +import lzma +from pathlib import Path +from functools import partial + +from cpmpy.tools.benchmark.test.runner import Runner, 
CompetitionPrintingObserver, ProfilingObserver, HandlerObserver, SolverArgsObserver, SolutionCheckerObserver, WriteToFileObserver +from cpmpy.tools.dataset.model.xcsp3 import XCSP3Dataset +from cpmpy.tools.xcsp3 import read_xcsp3 + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("instance", type=str) + parser.add_argument("--verbose", action="store_true", default=False) + parser.add_argument("--solver", type=str, default="ortools") + parser.add_argument("--time_limit", type=int, default=None) + parser.add_argument("--mem_limit", type=int, default=None) + parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--intermediate", action="store_true", default=False) + parser.add_argument("--cores", type=int, default=None) + parser.add_argument("--output_file", type=str, default=None) + # parser.add_argument("--kwargs", type=str, default="") + parser.add_argument("--observers", type=list[str], default=None) + + args = parser.parse_args() + + + if args.output_file is None: + args.output_file = f"results/{args.solver}_{args.instance}.txt" + else: + args.output_file = f"results/{args.output_file}" + + Path(args.output_file).parent.mkdir(parents=True, exist_ok=True) + + + # dataset = XCSP3Dataset(root="./data", year=2024, track="CSP24", download=True) + + runner = Runner(reader=partial(read_xcsp3, open= lambda instance: lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance))) + # runner.register_observer(LoggerObserver()) + runner.register_observer(CompetitionPrintingObserver()) + runner.register_observer(ProfilingObserver()) + # runner.register_observer(ResourceLimitObserver(time_limit=args.time_limit, mem_limit=args.mem_limit)) + runner.register_observer(HandlerObserver()) + runner.register_observer(SolverArgsObserver()) + runner.register_observer(SolutionCheckerObserver()) + #runner.register_observer(WriteToFileObserver(file_path=args.output_file)) + + for observer in 
args.observers: + pass + + + print(vars(args)) + runner.run(**vars(args)) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cpmpy/tools/benchmark/test/runner.py b/cpmpy/tools/benchmark/test/runner.py new file mode 100644 index 000000000..a3ff3ee6c --- /dev/null +++ b/cpmpy/tools/benchmark/test/runner.py @@ -0,0 +1,933 @@ +from abc import ABC, abstractmethod + +import psutil +from cpmpy.model import Model +import logging +import signal +import argparse +import sys +import warnings +import os +import time +from pathlib import Path +from typing import Optional +from functools import partial +import contextlib +import cpmpy as cp +from cpmpy.solvers import solver_interface +from cpmpy.tools.benchmark import set_time_limit, set_memory_limit + +from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus +from cpmpy.tools.benchmark.opb import solution_opb +from cpmpy.tools.benchmark import _mib_as_bytes, _wall_time, set_memory_limit, set_time_limit, _bytes_as_mb, _bytes_as_gb, disable_memory_limit + + +class Runner: + + def __init__(self, reader: callable): + self.observers = [] + self.solver_args = {} + self.reader = reader + + def register_observer(self, observer): + self.observers.append(observer) + + def read_instance(self, instance: str): + return self.reader(instance) + + def post_model(self, model: cp.Model, solver:str): + return cp.SolverLookup.get(solver, model) + + def run(self, instance: str, solver: Optional[str] = None, time_limit: Optional[int] = None, mem_limit: Optional[int] = None, seed: Optional[int] = None, intermediate: bool = False, cores: int = 1, **kwargs): + self.solver = solver + self.time_limit = time_limit + self.mem_limit = mem_limit + self.seed = seed + self.intermediate = intermediate + self.cores = cores + self.kwargs = kwargs + self.time_buffer = 1 + self.verbose = True + + with self.observer_context(): + self.observe_init() + + with self.print_forwarding_context(): + self.model = 
self.read_instance(instance) + + + self.observe_pre_transform() + with self.print_forwarding_context(): + self.s = self.post_model(self.model, solver) + self.observe_post_transform() + + self.solver_args = self.participate_solver_args() + + if self.time_limit: + # Get the current process + p = psutil.Process() + + # give solver only the remaining time + time_limit = self.time_limit - _wall_time(p) - self.time_buffer + if self.verbose: self.print_comment(f"{time_limit}s left to solve") + + else: + time_limit = None + + if time_limit is not None: + if time_limit < 0: + raise TimeoutError(f"Time limit of {self.time_limit} seconds reached") + + + self.observe_pre_solve() + with self.print_forwarding_context(): + self.is_sat = self.s.solve(time_limit = time_limit, **self.solver_args) + self.observe_post_solve() + + # Check if solver timed out (UNKNOWN status with time limit set) + if time_limit is not None and self.s.status().exitstatus == CPMStatus.UNKNOWN: + # Check if we're near the time limit (within 2 seconds) + p = psutil.Process() + elapsed = _wall_time(p) + if elapsed >= self.time_limit - 2: + self.print_comment(f"Timeout: Solver reached time limit of {self.time_limit} seconds (elapsed: {elapsed:.2f}s)") + + self.observe_end() + + #print(self.is_sat) + return self.is_sat + + def print_comment(self, comment: str): + for observer in self.observers: + observer.print_comment(comment) + + @contextlib.contextmanager + def print_forwarding_context(self): + """Context manager that forwards all print statements and warnings to observers.""" + class PrintForwarder: + def __init__(self, runner): + self.runner = runner + self.original_stdout = sys.stdout + self.original_stderr = sys.stderr + self.buffer = [] + + def write(self, text): + # Buffer the output + self.buffer.append(text) + # Also write to original stdout to preserve normal behavior + self.original_stdout.write(text) + + def flush(self): + self.original_stdout.flush() + + def forward_to_observers(self): + # 
Forward buffered output to observers line by line + if self.buffer: + full_text = ''.join(self.buffer) + for line in full_text.splitlines(keepends=True): + if line.strip(): # Only forward non-empty lines + self.runner.print_comment(line.rstrip()) + + def warning_handler(message, category, filename, lineno, file=None, line=None): + """Custom warning handler that forwards warnings to observers.""" + # Format the warning message + warning_msg = f"{category.__name__}: {str(message).rstrip()}" + # Forward to observers + self.print_comment(warning_msg) + # Also call the original warning handler to preserve normal behavior + original_showwarning(message, category, filename, lineno, file, line) + + forwarder = PrintForwarder(self) + original_showwarning = warnings.showwarning + + try: + # Redirect stdout and stderr + sys.stdout = forwarder + sys.stderr = forwarder + # Redirect warnings + warnings.showwarning = warning_handler + yield + finally: + # Restore stdout and stderr + sys.stdout = forwarder.original_stdout + sys.stderr = forwarder.original_stderr + # Restore warnings + warnings.showwarning = original_showwarning + # Forward any remaining buffered output + forwarder.forward_to_observers() + + def observer_context(self): + return ObserverContext(observers=self.observers, runner=self) + + def observe_init(self): + for observer in self.observers: + observer.observe_init(runner=self) + + def observe_pre_transform(self): + for observer in self.observers: + observer.observe_pre_transform(runner=self) + + def observe_post_transform(self): + for observer in self.observers: + observer.observe_post_transform(runner=self) + + + def observe_pre_solve(self): + for observer in self.observers: + observer.observe_pre_solve(runner=self) + + def observe_post_solve(self): + for observer in self.observers: + observer.observe_post_solve(runner=self) + + def observe_end(self): + for observer in self.observers: + observer.observe_end(runner=self) + + def participate_solver_args(self): + 
solver_args = {} + for observer in self.observers: + observer.participate_solver_args(runner=self, solver_args=solver_args) + return solver_args + +class ObserverContext: + def __init__(self, observers: list, runner: Runner): + self.observers = observers or [] + self.runner = runner + self.exit_stack = contextlib.ExitStack() + + def __enter__(self): + # Enter all context managers from observers + if self.observers: + for observer in self.observers: + cm = observer.get_context_manager(runner=self.runner) + if cm is not None: + self.exit_stack.enter_context(cm) + return self + + def __exit__(self, exc_type, exc_value, traceback): + # First, exit all context managers (in reverse order) + # This happens automatically when we exit the ExitStack + exit_result = None + if self.exit_stack: + exit_result = self.exit_stack.__exit__(exc_type, exc_value, traceback) + + if exc_type is not None and self.observers: + # An exception occurred, notify all observers + # Let ResourceLimitObserver handle it and decide if exception should be suppressed + suppress_exception = True + for observer in self.observers: + try: + # Pass exception to observer, let it handle it + result = observer.observe_exception(runner=self.runner, exc_type=exc_type, exc_value=exc_value, traceback=traceback) + # If observer returns True, it wants to suppress the exception + if result is True: + suppress_exception = True + except Exception: + # Don't let observer exceptions mask the original exception + pass + + # If any observer wants to suppress, suppress the exception + if suppress_exception: + return True + + # Always call observe_exit on all observers + if self.observers: + for observer in self.observers: + try: + observer.observe_exit(runner=self.runner) + except Exception: + # Don't let observer exceptions interfere with cleanup + pass + + # Return the exit result from ExitStack (False to propagate, True to suppress) + return exit_result if exit_result is not None else False + +class Observer(ABC): + + 
def observe_init(self, runner: Runner): + pass + + def observe_pre_transform(self, runner: Runner): + pass + + def observe_post_transform(self, runner: Runner): + pass + + def observe_pre_solve(self, runner: Runner): + pass + + def observe_post_solve(self, runner: Runner): + pass + + def participate_solver_args(self, runner: Runner, solver_args: dict): + return solver_args + + def observe_exception(self, runner: Runner, exc_type, exc_value, traceback): + """ + Called when an exception occurs in the context. + + Returns: + True if the exception should be suppressed, False/None to propagate it. + """ + pass + + def observe_exit(self, runner: Runner): + pass + + def observe_end(self, runner: Runner): + pass + + def print_comment(self, comment: str): + pass + + def observe_intermediate(self, runner: Runner, objective: int): + pass + + def get_context_manager(self, runner: Runner): + """ + Return a context manager that will be entered when the ObserverContext is entered. + Return None if this observer doesn't provide a context manager. 
+ """ + return None + +class HandlerObserver(Observer): + + def __init__(self): + self.runner = None + + def observe_init(self, runner: Runner): + self.runner = runner + signal.signal(signal.SIGINT, self._sigterm_handler) + signal.signal(signal.SIGTERM, self._sigterm_handler) + signal.signal(signal.SIGINT, self._sigterm_handler) + signal.signal(signal.SIGABRT, self._sigterm_handler) + if sys.platform != "win32": + signal.signal(signal.SIGXCPU, self._rlimit_cpu_handler) + else: + warnings.warn("Windows does not support setting SIGXCPU signal") + + def _sigterm_handler(self, _signo, _stack_frame): + exit_code = self.handle_sigterm() + print(flush=True) + os._exit(exit_code) + + def _rlimit_cpu_handler(self, _signo, _stack_frame): + # Raise TimeoutError - ObserverContext will handle notifying observers + # Don't notify here to avoid duplicates + raise TimeoutError("CPU time limit reached (SIGXCPU)") + + def handle_sigterm(self): + return 0 + + def handle_rlimit_cpu(self): + return 0 + +class LoggerObserver(Observer): + def __init__(self): + self.logger = logging.getLogger(__name__) + self.logger.setLevel(logging.INFO) + # Add a StreamHandler to output to stdout if no handlers exist + if not self.logger.handlers: + handler = logging.StreamHandler() + handler.setLevel(logging.INFO) + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + self.logger.addHandler(handler) + + def observe_init(self, runner: Runner): + self.logger.info("Initializing runner") + + def observe_pre_transform(self, runner: Runner): + self.logger.info("Pre-transforming") + + def observe_post_transform(self, runner: Runner): + self.logger.info("Post-transforming") + + def observe_pre_solve(self, runner: Runner): + self.logger.info("Pre-solving") + + def observe_post_solve(self, runner: Runner): + self.logger.info("Post-solving") + + def print_comment(self, comment: str): + self.logger.info(comment) + +class 
CompetitionPrintingObserver(Observer): + + def __init__(self, verbose: bool = False): + self.verbose = verbose + + def print_comment(self, comment: str): + print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) + + def observe_post_solve(self, runner: Runner): + self.print_result(runner.s) + + def observe_intermediate(self, objective: int): + self.print_intermediate(objective) + + def print_status(self, status: str): + print('s' + chr(32) + status, end="\n", flush=True) + + def print_value(self, value: str): + print('v' + chr(32) + value, end="\n", flush=True) + + def print_objective(self, objective: int): + print('o' + chr(32) + str(objective), end="\n", flush=True) + + def print_intermediate(self, objective: int): + self.print_objective(objective) + + def print_result(self, s): + if s.status().exitstatus == CPMStatus.OPTIMAL: + self.print_objective(s.objective_value()) + self.print_value(solution_opb(s)) + self.print_status("OPTIMAL" + chr(32) + "FOUND") + elif s.status().exitstatus == CPMStatus.FEASIBLE: + self.print_objective(s.objective_value()) + self.print_value(solution_opb(s)) + self.print_status("SATISFIABLE") + elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: + self.print_status("UNSATISFIABLE") + else: + self.print_comment("Solver did not find any solution within the time/memory limit") + self.print_status("UNKNOWN") + +class ResourceLimitObserver(Observer): + def __init__(self, time_limit: Optional[int] = None, mem_limit: Optional[int] = None): + self.time_limit = time_limit + self.mem_limit = mem_limit + + def observe_init(self, runner: Runner): + if self.time_limit is not None: + set_time_limit(self.time_limit) + if self.mem_limit is not None: + set_memory_limit(self.mem_limit) + + def _handle_memory_error(self, runner: Runner, mem_limit: int): + runner.print_comment(f"MemoryError raised. 
Reached limit of {mem_limit} MiB") + + def _handle_timeout(self, runner: Runner, time_limit: int): + if time_limit is not None: + runner.print_comment(f"TimeoutError raised. Reached limit of {time_limit} seconds") + else: + runner.print_comment(f"TimeoutError raised. CPU time limit reached") + + def observe_exception(self, runner: Runner, exc_type, exc_value, traceback): + """ + Handle exceptions related to resource limits. + Returns True to suppress the exception after handling. + """ + if exc_type is MemoryError: + # Only handle if we have a memory limit set + if self.mem_limit is not None: + self._handle_memory_error(runner=runner, mem_limit=self.mem_limit) + return True # Suppress the exception after handling + elif exc_type is TimeoutError: + # Only handle if we have a time limit set + if self.time_limit is not None: + self._handle_timeout(runner=runner, time_limit=self.time_limit) + return True # Suppress the exception after handling + return False # Don't suppress other exceptions + + +class SolverArgsObserver(Observer): + + def __init__(self): + self.time_limit = None + self.mem_limit = None + self.seed = None + self.intermediate = False + self.cores = 1 + self.mem_limit = None + self.kwargs = dict() + + def observe_init(self, runner: Runner): + self.time_limit = runner.time_limit + self.mem_limit = runner.mem_limit + self.seed = runner.seed + self.intermediate = runner.intermediate + self.cores = runner.cores + self.mem_limit = runner.mem_limit + self.kwargs = runner.kwargs + + def _ortools_arguments( + self, + runner: Runner, + model: cp.Model, + cores: Optional[int] = None, + seed: Optional[int] = None, + intermediate: bool = False, + **kwargs + ): + # https://github.com/google/or-tools/blob/stable/ortools/sat/sat_parameters.proto + res = dict() + + # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 + res |= { + "interleave_search": True, + "use_rins_lns": False, + } + if not 
model.has_objective(): + res |= { "num_violation_ls": 1 } + + if cores is not None: + res |= { "num_search_workers": cores } + if seed is not None: + res |= { "random_seed": seed } + + if intermediate and model.has_objective(): + # Define custom ORT solution callback, then register it + _self = self + from ortools.sat.python import cp_model as ort + class OrtSolutionCallback(ort.CpSolverSolutionCallback): + """ + For intermediate objective printing. + """ + + def __init__(self): + super().__init__() + self.__start_time = time.time() + self.__solution_count = 1 + + def on_solution_callback(self): + """Called on each new solution.""" + + current_time = time.time() + obj = int(self.ObjectiveValue()) + _self.print_comment('Solution %i, time = %0.4fs' % + (self.__solution_count, current_time - self.__start_time)) + _self.observe_intermediate(runner=runner, objective=obj) + self.__solution_count += 1 + + + def solution_count(self): + """Returns the number of solutions found.""" + return self.__solution_count + + # Register the callback + res |= { "solution_callback": OrtSolutionCallback() } + + def internal_options(solver: "CPM_ortools"): + # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 + solver.ort_solver.parameters.subsolvers.extend(["default_lp", "max_lp", "quick_restart"]) + if not model.has_objective(): + solver.ort_solver.parameters.subsolvers.append("core_or_no_lp") + if len(solver.ort_model.proto.search_strategy) != 0: + solver.ort_solver.parameters.subsolvers.append("fixed") + + return res, internal_options + + def _exact_arguments( + self, + seed: Optional[int] = None, + **kwargs + ): + # Documentation: https://gitlab.com/JoD/exact/-/blob/main/src/Options.hpp?ref_type=heads + res = dict() + if seed is not None: + res |= { "seed": seed } + + return res, None + + def _choco_arguments(self): + # Documentation: https://github.com/chocoteam/pychoco/blob/master/pychoco/solver.py + return 
{}, None
+
+    def _z3_arguments(
+        self,
+        model: cp.Model,
+        cores: int = 1,
+        seed: Optional[int] = None,
+        mem_limit: Optional[int] = None,
+        **kwargs
+    ):
+        # Documentation: https://microsoft.github.io/z3guide/programming/Parameters/
+        # -> is outdated, just let it crash and z3 will report the available options
+
+        res = dict()
+
+        if model.has_objective():
+            # Opt does not seem to support setting random seed or max memory
+            pass
+        else:
+            # Sat parameters
+            if cores is not None:
+                res |= { "threads": cores } # TODO: what about hyperthreading, when more threads than cores
+            if seed is not None:
+                res |= { "random_seed": seed }
+            if mem_limit is not None:
+                res |= { "max_memory": _bytes_as_mb(mem_limit) }
+
+        return res, None
+
+    def _minizinc_arguments(
+        self,
+        solver: str,
+        cores: Optional[int] = None,
+        seed: Optional[int] = None,
+        **kwargs
+    ):
+        # Documentation: https://minizinc-python.readthedocs.io/en/latest/api.html#minizinc.instance.Instance.solve
+        res = dict()
+        if cores is not None:
+            res |= { "processes": cores }
+        if seed is not None:
+            res |= { "random_seed": seed }
+
+        #if solver.endswith("gecode"):
+            # Documentation: https://www.minizinc.org/doc-2.4.3/en/lib-gecode.html
+        #elif solver.endswith("chuffed"):
+            # Documentation:
+            # - https://www.minizinc.org/doc-2.5.5/en/lib-chuffed.html
+            # - https://github.com/chuffed/chuffed/blob/develop/chuffed/core/options.h
+
+        return res, None
+
+    def _gurobi_arguments(
+        self,
+        model: cp.Model,
+        cores: Optional[int] = None,
+        seed: Optional[int] = None,
+        mem_limit: Optional[int] = None,
+        intermediate: bool = False,
+        **kwargs
+    ):
+        # Documentation: https://www.gurobi.com/documentation/9.5/refman/parameters.html#sec:Parameters
+        res = dict()
+        if cores is not None:
+            res |= { "Threads": cores }
+        if seed is not None:
+            res |= { "Seed": seed }
+        if mem_limit is not None:
+            res |= { "MemLimit": _bytes_as_gb(mem_limit) }
+
+        if intermediate and model.has_objective():
+
+            _self = self
+
+            class 
GurobiSolutionCallback: + def __init__(self, model:cp.Model): + self.__start_time = time.time() + self.__solution_count = 0 + self.model = model + + def callback(self, *args, **kwargs): + current_time = time.time() + model, state = args + + # Callback codes: https://www.gurobi.com/documentation/current/refman/cb_codes.html#sec:CallbackCodes + + from gurobipy import GRB + # if state == GRB.Callback.MESSAGE: # verbose logging + # print_comment("log message: " + str(model.cbGet(GRB.Callback.MSG_STRING))) + if state == GRB.Callback.MIP: # callback from the MIP solver + if model.cbGet(GRB.Callback.MIP_SOLCNT) > self.__solution_count: # do we have a new solution? + + obj = int(model.cbGet(GRB.Callback.MIP_OBJBST)) + _self.print_comment('Solution %i, time = %0.4fs' % + (self.__solution_count, current_time - self.__start_time)) + _self.print_intermediate(obj) + self.__solution_count = model.cbGet(GRB.Callback.MIP_SOLCNT) + + res |= { "solution_callback": GurobiSolutionCallback(model).callback } + + return res, None + + def _cpo_arguments( + self, + model: cp.Model, + cores: Optional[int] = None, + seed: Optional[int] = None, + intermediate: bool = False, + **kwargs + ): + # Documentation: https://ibmdecisionoptimization.github.io/docplex-doc/cp/docplex.cp.parameters.py.html#docplex.cp.parameters.CpoParameters + res = dict() + if cores is not None: + res |= { "Workers": cores } + if seed is not None: + res |= { "RandomSeed": seed } + + if intermediate and model.has_objective(): + from docplex.cp.solver.solver_listener import CpoSolverListener + _self = self + class CpoSolutionCallback(CpoSolverListener): + + def __init__(self): + super().__init__() + self.__start_time = time.time() + self.__solution_count = 1 + + def result_found(self, solver, sres): + current_time = time.time() + obj = sres.get_objective_value() + if obj is not None: + _self.print_comment('Solution %i, time = %0.4fs' % + (self.__solution_count, current_time - self.__start_time)) + 
_self.print_intermediate(obj) + self.__solution_count += 1 + + def solution_count(self): + """Returns the number of solutions found.""" + return self.__solution_count + + # Register the callback + res |= { "solution_callback": CpoSolutionCallback } + + return res, None + + def _cplex_arguments( + self, + cores: Optional[int] = None, + seed: Optional[int] = None, + **kwargs + ): + res = dict() + if cores is not None: + res |= {"threads": cores} + if seed is not None: + res |= {"randomseed": seed} + + return res, None + + def _hexaly_arguments( + self, + model: cp.Model, + cores: Optional[int] = None, + seed: Optional[int] = None, + intermediate: bool = False, + **kwargs + ): + res = dict() + #res |= {"nb_threads": cores} + #res |= {"seed": seed} + + + if intermediate and model.has_objective(): + # Define custom Hexaly solution callback, then register it + + _self = self + class HexSolutionCallback: + + def __init__(self): + self.__start_time = time.time() + self.__solution_count = 0 + + + def on_solution_callback(self, optimizer, cb_type): + """Called on each new solution.""" + # check if solution with different objective (or if verbose) + current_time = time.time() + obj = optimizer.model.objectives[0] + _self.print_comment('Solution %i, time = %0.4fs' % + (self.__solution_count, current_time - self.__start_time)) + _self.print_intermediate(obj) + self.__solution_count += 1 + + def solution_count(self): + return self.__solution_count + + # Register the callback + res |= { "solution_callback": HexSolutionCallback().on_solution_callback } + + + # def internal_options(solver: "CPM_hexaly"): + # # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 + # #solver.native_model.get_param().set_seed(seed) + # #solver.native_model.get_param().set_nr_threads(cores) + + # _self = self + # class CallbackExample: + # def __init__(self): + # self.last_best_value = 0 + # self.last_best_running_time = 0 + # 
self.__solution_count = 0 + # self.__start_time = time.time() + + # def my_callback(self, optimizer, cb_type): + # stats = optimizer.statistics + # obj = optimizer.model.objectives[0] + # current_time = time.time() + # #obj = int(self.ObjectiveValue()) + # #obj = optimizer.get_objective_bound(0).value + # if obj.value > self.last_best_value: + # self.last_best_running_time = stats.running_time + # self.last_best_value = obj.value + # self.__solution_count += 1 + + # _self.print_comment('Solution %i, time = %0.4fs' % + # (self.__solution_count, current_time - self.__start_time)) + # _self.print_intermediate(obj.value) + + # optimizer = solver.native_model + # cb = CallbackExample() + # from hexaly.optimizer import HxCallbackType + # optimizer.add_callback(HxCallbackType.TIME_TICKED, cb.my_callback) + + return res, None + + def _solver_arguments( + self, + runner: Runner, + solver: str, + model: cp.Model, + seed: Optional[int] = None, + intermediate: bool = False, + cores: int = 1, + mem_limit: Optional[int] = None, + **kwargs + ): + opt = model.has_objective() + sat = not opt + + if solver == "ortools": + return self._ortools_arguments(runner, model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) + elif solver == "exact": + return self._exact_arguments(seed=seed, **kwargs) + elif solver == "choco": + return self._choco_arguments() + elif solver == "z3": + return self._z3_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, **kwargs) + elif solver.startswith("minizinc"): # also can have a subsolver + return self._minizinc_arguments(solver, cores=cores, seed=seed, **kwargs) + elif solver == "gurobi": + return self._gurobi_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, intermediate=intermediate, opt=opt, **kwargs) + elif solver == "cpo": + return self._cpo_arguments(model=model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) + elif solver == "hexaly": + return self._hexaly_arguments(model, cores=cores, seed=seed, 
intermediate=intermediate, **kwargs) + elif solver == "cplex": + return self._cplex_arguments(cores=cores, **kwargs) + else: + runner.print_comment(f"setting parameters of {solver} is not (yet) supported") + return dict(), None + + def participate_solver_args(self, runner: Runner, solver_args: dict): + args, internal_options = self._solver_arguments(runner, runner.solver, model=runner.model, seed=self.seed, + intermediate=self.intermediate, + cores=self.cores, mem_limit=_mib_as_bytes(self.mem_limit) if self.mem_limit is not None else None, + **self.kwargs) + + if internal_options is not None: + internal_options(runner.s) + solver_args |= args + runner.print_comment(f"Solver arguments: {args}") + +class ProfilingObserver(Observer): + + def __init__(self): + self.start_time = None + self.end_time = None + self.start_transform_time = None + self.end_transform_time = None + + def observe_init(self, runner: Runner): + self.start_time = time.time() + + def observe_pre_transform(self, runner: Runner): + self.start_transform_time = time.time() + + def observe_post_transform(self, runner: Runner): + self.end_transform_time = time.time() + runner.print_comment(f"Time taken to transform: {self.end_transform_time - self.start_transform_time} seconds") + + def observe_post_solve(self, runner: Runner): + runner.print_comment(f"Time taken to solve: {runner.s.status().runtime} seconds") + + def observe_end(self, runner: Runner): + runner.print_comment(f"Total time taken: {time.time() - self.start_time} seconds") + +class SolutionCheckerObserver(Observer): + + def observe_end(self, runner: Runner): + runner.print_comment(f"Run solution checker here...") + +class WriteToFileObserver(Observer): + def __init__(self, file_path: str): + self.file_path = file_path + + def get_context_manager(self, runner: Runner): + """Return a context manager that redirects stdout to a file.""" + @contextlib.contextmanager + def redirect_to_file(): + with open(self.file_path, 'w') as f: + with 
contextlib.redirect_stdout(f): + yield + return redirect_to_file() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("instance", type=str) + parser.add_argument("--verbose", action="store_true", default=False) + parser.add_argument("--solver", type=str, default="ortools") + parser.add_argument("--time_limit", type=int, default=None) + parser.add_argument("--mem_limit", type=int, default=None) + parser.add_argument("--seed", type=int, default=None) + parser.add_argument("--intermediate", action="store_true", default=False) + parser.add_argument("--cores", type=int, default=None) + parser.add_argument("--output_file", type=str, default=None) + # parser.add_argument("--kwargs", type=str, default="") + + args = parser.parse_args() + + + if args.output_file is None: + args.output_file = f"results/{args.solver}_{args.instance}.txt" + else: + args.output_file = f"results/{args.output_file}" + + Path(args.output_file).parent.mkdir(parents=True, exist_ok=True) + + + from cpmpy.tools.rcpsp import read_rcpsp + from cpmpy.tools.dataset.problem.psplib import PSPLibDataset + dataset = PSPLibDataset(root="./data", download=True) + + runner = Runner(reader=partial(read_rcpsp, open=dataset.open)) + # runner.register_observer(LoggerObserver()) + runner.register_observer(CompetitionPrintingObserver()) + runner.register_observer(ProfilingObserver()) + # runner.register_observer(ResourceLimitObserver(time_limit=args.time_limit, mem_limit=args.mem_limit)) + runner.register_observer(HandlerObserver()) + runner.register_observer(SolverArgsObserver()) + runner.register_observer(SolutionCheckerObserver()) + runner.register_observer(WriteToFileObserver(file_path=args.output_file)) + print(vars(args)) + runner.run(**vars(args)) + +if __name__ == "__main__": + main() + + # from cpmpy.tools.dataset.model.xcsp3 import XCSP3Dataset + # from cpmpy.tools.xcsp3 import read_xcsp3 + + # from cpmpy.tools.dataset.model.opb import OPBDataset + # from cpmpy.tools.opb import 
read_opb + + # from cpmpy.tools.dataset.problem.jsplib import JSPLibDataset + # from cpmpy.tools.jsplib import read_jsplib + + # from cpmpy.tools.dataset.problem.psplib import PSPLibDataset + # from cpmpy.tools.rcpsp import read_rcpsp + + # # dataset = XCSP3Dataset(root="./data", year=2025, track="CSP25", download=True) + # dataset = OPBDataset(root="./data", year=2024, track="DEC-LIN", download=True) + # dataset = JSPLibDataset(root="./data", download=True) + # dataset = PSPLibDataset(root="./data", download=True) + + # for instance, metadata in dataset: + # print(instance, metadata) + # runner = Runner(reader=partial(read_rcpsp, open=dataset.open)) + # #runner.register_observer(LoggerObserver()) + # runner.register_observer(CompetitionPrintingObserver()) + # runner.register_observer(ProfilingObserver()) + # #runner.register_observer(ResourceLimitObserver(time_limit=10, mem_limit=1024)) + # runner.register_observer(HandlerObserver()) + # runner.register_observer(SolverArgsObserver()) + # runner.register_observer(SolutionCheckerObserver()) + # runner.run(instance, solver="ortools") + + # break \ No newline at end of file diff --git a/cpmpy/tools/benchmark/test/xcsp3_instance_runner.py b/cpmpy/tools/benchmark/test/xcsp3_instance_runner.py new file mode 100644 index 000000000..4c45d1e89 --- /dev/null +++ b/cpmpy/tools/benchmark/test/xcsp3_instance_runner.py @@ -0,0 +1,66 @@ +from functools import partial +import lzma +from pathlib import Path +from cpmpy.tools.benchmark.test.instance_runner import InstanceRunner +import os, sys + +from cpmpy.tools.benchmark.test.runner import CompetitionPrintingObserver, HandlerObserver, ProfilingObserver, ResourceLimitObserver, Runner, SolverArgsObserver, SolutionCheckerObserver +from cpmpy.tools.xcsp3.parser import read_xcsp3 + +class XCSP3InstanceRunner(InstanceRunner): + + this_file_path = os.path.abspath(__file__) + this_python = sys.executable + + def cmd(self, instance: str, solver: str = "ortools", output_file: str = None, 
**kwargs): + cmd = [ + self.this_python, + self.this_file_path, + instance, + ] + if solver is not None: + cmd.append("--solver") + cmd.append(solver) + if output_file is not None: + cmd.append("--output_file") + cmd.append(output_file) + return cmd + + def print_comment(self, comment: str): + print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) + + def run(self, instance: str, solver: str = "ortools", output_file: str = None, **kwargs): + + if output_file is None: + output_file = f"results/{solver}_{instance}.txt" + else: + output_file = f"results/{output_file}" + + Path(output_file).parent.mkdir(parents=True, exist_ok=True) + + runner = Runner(reader=partial(read_xcsp3, open= lambda instance: lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance))) + + runner.register_observer(CompetitionPrintingObserver()) + runner.register_observer(ProfilingObserver()) + runner.register_observer(HandlerObserver()) + runner.register_observer(SolverArgsObserver()) + runner.register_observer(SolutionCheckerObserver()) + runner.register_observer(ResourceLimitObserver()) # Don't enforce any limits, just observe / capture exceptions + + # Register any additional observers that were added programmatically + for observer in self.get_additional_observers(): + runner.register_observer(observer) + + runner.run(instance=instance, solver=solver, output_file=output_file, **kwargs) + + +def main(): + runner = XCSP3InstanceRunner() + + parser = runner.argparser() + args = parser.parse_args() + + runner.run(**vars(args)) + +if __name__ == "__main__": + main() \ No newline at end of file From 5611bf0638dc291e477e57ed3fe504e337777b38 Mon Sep 17 00:00:00 2001 From: Tias Guns Date: Wed, 7 Jan 2026 12:19:59 +0100 Subject: [PATCH 055/152] test_examples: missing comma in skip list --- tests/test_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_examples.py b/tests/test_examples.py index 
a862e3e34..653d3f454 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -24,7 +24,7 @@ SKIPPED_EXAMPLES = [ "ocus_explanations.py", # waiting for issues to be resolved - "psplib.py" # randomly fails on github due to file creation + "psplib.py", # randomly fails on github due to file creation "nurserostering.py" ] From b1e6793dbffbd57fc1182ce4ae39c670e13ff633 Mon Sep 17 00:00:00 2001 From: Thomas Sergeys <49067410+ThomSerg@users.noreply.github.com> Date: Wed, 7 Jan 2026 12:40:02 +0100 Subject: [PATCH 056/152] Missing packaging in setup.py (#813) 'packaging' was added to `requirements.txt`, but not to `setup.py` --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 7223d890d..a4f5fe313 100644 --- a/setup.py +++ b/setup.py @@ -53,6 +53,7 @@ def get_version(rel_path): 'ortools>=9.9', 'numpy>=1.5', 'setuptools', + 'packaging', # to check solver versions ], extras_require={ # Solvers From 841a8abfb351cdb3523fcee71738ed3793c22595 Mon Sep 17 00:00:00 2001 From: Henk Bierlee Date: Thu, 8 Jan 2026 09:43:47 +0100 Subject: [PATCH 057/152] Expand CNF using encoding back-end (#782) * Expand `to_cnf` transformation using pindakaas back-end * Minimize number of free var clauses (x | ~x) * Bypass `pindakaas` for simple clauses * Handle pindakaas dependency * Fix import * Remove return of `ivarmap` * add encoding parameter to cnf transform * update dimacs writer test * pass encoding arg to cnf from dimacs writer --------- Co-authored-by: Ignace Bleukx --- cpmpy/solvers/pindakaas.py | 51 ++++++++----- cpmpy/tools/dimacs.py | 5 +- cpmpy/transformations/to_cnf.py | 121 ++++++++++++++---------------- tests/test_tocnf.py | 127 ++++++++++++++++++-------------- tests/test_tool_dimacs.py | 11 ++- 5 files changed, 171 insertions(+), 144 deletions(-) diff --git a/cpmpy/solvers/pindakaas.py b/cpmpy/solvers/pindakaas.py index 0a37b417c..2291aa3e1 100755 --- a/cpmpy/solvers/pindakaas.py +++ b/cpmpy/solvers/pindakaas.py @@ -124,6 +124,20 @@ def 
__init__(self, cpm_model=None, subsolver=None): def native_model(self): return self.pdk_solver + def _int2bool_user_vars(self): + # ensure all vars are known to solver + self.solver_vars(list(self.user_vars)) + + # the user vars are only the Booleans (e.g. to ensure solveAll behaves consistently) + user_vars = set() + for x in self.user_vars: + if isinstance(x, _BoolVarImpl): + user_vars.add(x) + else: + # extends set with encoding variables of `x` + user_vars.update(self.ivarmap[x.name].vars()) + return user_vars + def solve(self, time_limit:Optional[float]=None, assumptions:Optional[List[_BoolVarImpl]]=None): """ Solve the encoded CPMpy model given optional time limit and assumptions, returning whether a solution was found. @@ -138,19 +152,7 @@ def solve(self, time_limit:Optional[float]=None, assumptions:Optional[List[_Bool if time_limit is not None and time_limit <= 0: raise ValueError("Time limit must be positive") - # ensure all vars are known to solver - self.solver_vars(list(self.user_vars)) - - # the user vars are only the Booleans (e.g. 
to ensure solveAll behaves consistently) - user_vars = set() - for x in self.user_vars: - if isinstance(x, _BoolVarImpl): - user_vars.add(x) - else: - # extends set with encoding variables of `x` - user_vars.update(self.ivarmap[x.name].vars()) - - self.user_vars = user_vars + self.user_vars = self._int2bool_user_vars() if time_limit is not None: time_limit = timedelta(seconds=time_limit) @@ -269,22 +271,33 @@ def add(self, cpm_expr_orig): __add__ = add # avoid redirect in superclass + def _add_clause(self, cpm_expr): + if not isinstance(cpm_expr, list): + raise TypeError + + self.pdk_solver.add_clause(self.solver_vars(cpm_expr)) + def _post_constraint(self, cpm_expr, conditions=[]): + if not isinstance(conditions, list): + raise TypeError + """Add a single, *transformed* constraint, implied by conditions.""" if isinstance(cpm_expr, BoolVal): # base case: Boolean value if cpm_expr.args[0] is False: - self.pdk_solver.add_clause(conditions) + self._add_clause(conditions) elif isinstance(cpm_expr, _BoolVarImpl): # (implied) literal - self.pdk_solver.add_clause(conditions + [self.solver_var(cpm_expr)]) + self._add_clause(conditions + [cpm_expr]) elif cpm_expr.name == "or": # (implied) clause - self.pdk_solver.add_clause(conditions + self.solver_vars(cpm_expr.args)) + self._add_clause(conditions + cpm_expr.args) elif cpm_expr.name == "->": # implication a0, a1 = cpm_expr.args - self._post_constraint(a1, conditions=conditions + [~self.solver_var(a0)]) + if not isinstance(a0, _BoolVarImpl): + raise TypeError + self._post_constraint(a1, conditions=conditions + [~a0]) elif isinstance(cpm_expr, Comparison): # Bool linear assert cpm_expr.name in {"<=", ">=", "=="}, ( @@ -306,7 +319,9 @@ def _post_constraint(self, cpm_expr, conditions=[]): lhs = sum(c * l for c, l in zip(coefficients, self.solver_vars(literals))) - self.pdk_solver.add_encoding(eval_comparison(cpm_expr.name, lhs, rhs), conditions=conditions) + self.pdk_solver.add_encoding( + eval_comparison(cpm_expr.name, lhs, 
rhs), conditions=self.solver_vars(conditions) + ) else: raise NotSupportedError(f"{self.name}: Unsupported constraint {cpm_expr}") diff --git a/cpmpy/tools/dimacs.py b/cpmpy/tools/dimacs.py index 4a3ef0d2c..19ab8d444 100644 --- a/cpmpy/tools/dimacs.py +++ b/cpmpy/tools/dimacs.py @@ -25,7 +25,7 @@ import re -def write_dimacs(model, fname=None): +def write_dimacs(model, fname=None, encoding="auto"): """ Writes CPMpy model to DIMACS format Uses the "to_cnf" transformation from CPMpy @@ -35,10 +35,11 @@ def write_dimacs(model, fname=None): :param model: a CPMpy model :param fname: optional, file name to write the DIMACS output to + :param encoding: the encoding used for `int2bool`, choose from ("auto", "direct", "order", or "binary") """ constraints = toplevel_list(model.constraints) - constraints = to_cnf(constraints) + constraints = to_cnf(constraints, encoding=encoding) vars = get_variables(constraints) mapping = {v : i+1 for i, v in enumerate(vars)} diff --git a/cpmpy/transformations/to_cnf.py b/cpmpy/transformations/to_cnf.py index 5a03f28e2..2dfb25842 100644 --- a/cpmpy/transformations/to_cnf.py +++ b/cpmpy/transformations/to_cnf.py @@ -1,82 +1,71 @@ """ - Meta-transformation for obtaining a CNF from a list of constraints. +Transform constraints to **Conjunctive Normal Form** (i.e. an `and` of `or`s of literals, i.e. Boolean variables or their negation, e.g. from `x xor y` to `(x or ~y) and (~x or y)`) using a back-end encoding library and its transformation pipeline. +""" - Converts the logical constraints into disjuctions using the tseitin transform, - including flattening global constraints that are :func:`~cpmpy.expressions.core.Expression.is_bool()` and not in `supported`. +import cpmpy as cp +from ..solvers.pindakaas import CPM_pindakaas +from ..transformations.get_variables import get_variables - .. note:: - The transformation is no longer used by the SAT solvers, and may be outdated. - Check :meth:`CPM_pysat.transform ` for an up-to-date alternative. 
- - Other constraints are copied verbatim so this transformation - can also be used in non-pure CNF settings. - The implementation first converts the list of constraints - to **Flat Normal Form**, this already flattens subexpressions using - auxiliary variables. +def to_cnf(constraints, csemap=None, ivarmap=None, encoding="auto"): + """ + Converts all constraints into **Conjunctive Normal Form** - What is then left to do is to tseitin encode the following into CNF: + Arguments: + constraints: list[Expression] or Operator + csemap: `dict()` used for CSE + ivarmap: `dict()` used to map integer variables to their encoding (usefull for finding the values of the now-encoded integer variables) + encoding: the encoding used for `int2bool`, choose from ("auto", "direct", "order", or "binary") + Returns: + Equivalent CPMpy constraints in CNF, and the updated `ivarmap` + """ + if not CPM_pindakaas.supported(): + raise ImportError( + f"Install the Pindakaas python library `pindakaas` (e.g. `pip install pindakaas`) package to use the `to_cnf` transformation" + ) - - ``BV`` with BV a ``BoolVar`` (or ``NegBoolView``) - - ``or([BV])`` constraint - - ``and([BV])`` constraint - - ``BE != BV`` with ``BE :: BV|or()|and()|BV!=BV|BV==BV|BV->BV`` - - ``BE == BV`` - - ``BE -> BV`` - - ``BV -> BE`` -""" -from ..expressions.core import Operator -from ..expressions.variables import _BoolVarImpl -from .reification import only_implies -from .flatten_model import flatten_constraint + import pindakaas as pdk -def to_cnf(constraints, csemap=None): - """ - Converts all logical constraints into **Conjunctive Normal Form** + slv = CPM_pindakaas() + slv.encoding = encoding - Arguments: - constraints: list[Expression] or Operator - supported: (frozen)set of global constraint names that do not need to be decomposed - """ - fnf = flatten_constraint(constraints, csemap=csemap) - fnf = only_implies(fnf, csemap=csemap) - return flat2cnf(fnf) + if ivarmap is not None: + slv.ivarmap = ivarmap + slv._csemap 
= csemap -def flat2cnf(constraints): - """ - Converts from **Flat Normal Form** all logical constraints into **Conjunctive Normal Form**, - including flattening global constraints that are :func:`~cpmpy.expressions.core.Expression.is_bool()` and not in `supported`. + # the encoded constraints (i.e. `PB`s) will be added to this `pdk.CNF` object + slv.pdk_solver = pdk.CNF() - What is now left to do is to tseitin encode: + # however, we bypass `pindakaas` for simple clauses + clauses = [] + slv._add_clause = lambda cpm_expr: clauses.append(cp.any(cpm_expr)) - - ``BV`` with BV a ``BoolVar`` (or ``NegBoolView``) - - ``or([BV])`` constraint - - ``and([BV])`` constraint - - ``BE != BV`` with ``BE :: BV|or()|and()|BV!=BV|BV==BV|BV->BV`` - - ``BE == BV`` - - ``BE -> BV`` - - ``BV -> BE`` + # add, transform, and encode constraints into CNF/clauses + slv += constraints - We do it in a principled way for each of the cases. (in)equalities - get transformed into implications, everything is modular. + # now we read the pdk.CNF back to cpmpy constraints by mapping from `pdk.Lit` to CPMpy lit + cpmpy_vars = {str(slv.solver_var(x).var()): x for x in slv._int2bool_user_vars()} - Arguments: - constraints: list[Expression] or Operator - """ - cnf = [] - for expr in constraints: - # BE -> BE - if expr.name == '->': - a0,a1 = expr.args + # if a user variable `x` does not occur in any clause, they should be added as `x | ~x` + free_vars = set(cpmpy_vars.values()) - set(get_variables(clauses)) - # BoolVar() -> BoolVar() - if isinstance(a1, _BoolVarImpl) or \ - (isinstance(a1, Operator) and a1.name == 'or'): - cnf.append(~a0 | a1) - continue + def to_cpmpy_clause(clause): + """Lazily convert `pdk.CNF` to CPMpy.""" + for lit in clause: + x = str(lit.var()) + if x not in cpmpy_vars: + cpmpy_vars[x] = cp.boolvar() + y = cpmpy_vars[x] + try: + free_vars.remove(y) + except KeyError: + pass + if lit.is_negated(): + yield ~y + else: + yield y - # all other cases added as is... 
- # TODO: we should raise here? is not really CNF... - cnf.append(expr) + clauses += (cp.any(to_cpmpy_clause(clause)) for clause in slv.pdk_solver.clauses()) + clauses += ((x | ~x) for x in free_vars) # add free variables so they are "known" by the CNF - return cnf + return clauses diff --git a/tests/test_tocnf.py b/tests/test_tocnf.py index eefef07bb..efdd1048c 100644 --- a/tests/test_tocnf.py +++ b/tests/test_tocnf.py @@ -1,76 +1,91 @@ import unittest -import numpy as np -from cpmpy import * -from cpmpy.solvers import CPM_ortools +import cpmpy as cp + + from cpmpy.transformations.to_cnf import to_cnf from cpmpy.transformations.get_variables import get_variables -from cpmpy.expressions.core import Operator from cpmpy.expressions.globalconstraints import Xor +from cpmpy.expressions.utils import argvals +from cpmpy.solvers.pindakaas import CPM_pindakaas + +import pytest + +@pytest.mark.skipif(not CPM_pindakaas.supported(), reason="Pindakaas (required for `to_cnf`) not installed") class TestToCnf(unittest.TestCase): def test_tocnf(self): - a,b,c = boolvar(shape=3) + a, b, clause = cp.boolvar(shape=3) + x = cp.intvar(1, 2) + y, z = cp.intvar(0, 1, shape=2) - cases = [a, - a|b, - a&b, - a!=b, - a==b, - a.implies(b), - a.implies(b|c), - a.implies(b&c), - a.implies(b!=c), - a.implies(b==c), - a.implies(b.implies(c)), - (b|c).implies(a), - (b&c).implies(a), - (b!=c).implies(a), - (b==c).implies(a), - (b.implies(c)).implies(a), - Xor([a,b]), - ] + bvs = cp.boolvar(shape=3) + cases = [ + a, + a | b, + a & b, + a != b, + a == b, + a.implies(b), + a.implies(b | clause), + a.implies(b & clause), + a.implies(b != clause), + a.implies(b == clause), + a.implies(b.implies(clause)), + (b | clause).implies(a), + (b & clause).implies(a), + (b != clause).implies(a), + (b == clause).implies(a), + (b.implies(clause)).implies(a), + Xor([a, b]), + cp.sum([2 * x + 3 * y]) <= 4, + cp.sum([2 * x + 3 * y + 5 * z]) <= 6, + cp.sum([2 * cp.intvar(1, 2) + 3 * cp.intvar(0, 1)]) <= 4, + cp.sum([3 * 
cp.intvar(0, 1)]) <= 4, + (a + b + clause) == 1, + # a * b == 1, # TODO in linearization! + # a * b != 1, + (a + b + clause) != 1, + a + b + clause > 2, + a + b + clause <= 2, + cp.sum(cp.intvar(lb=2, ub=3, shape=3)) <= 3, + ] # test for equivalent solutions with/without to_cnf for case in cases: - vs = cpm_array(get_variables(case)) + vs = cp.cpm_array(get_variables(case)) s1 = self.allsols([case], vs) - s1.sort(axis=0) - s2 = self.allsols(to_cnf(case), vs) - s2.sort(axis=0) - for ss1,ss2 in zip(s1,s2): - self.assertTrue(np.all(ss1 == ss2), (case, s1, s2)) + ivarmap = dict() + cnf = to_cnf(case, ivarmap=ivarmap) - # test for errors in edge cases of to_cnf - bvs = boolvar(shape=3) - ivs = intvar(lb=2, ub=3, shape=3) - edge_cases = [ - # do not consider object as a double implcation, but as a sum - (a + b + c) == 1, - a * b == 1, - a * b != 1, - (a + b + c) != 1, - sum(bvs) > 2, - sum(bvs) <= 2, - sum(ivs) <= 3 - ] + # TODO + # assert ( + # cnf is False + # or isinstance(cnf, _BoolVarImpl) + # or cnf.name == "and" + # and all( + # clause.name == "or" + # and all([is_bool(lit) or isinstance(lit, _BoolVarImpl) for lit in clause.args]) + # for clause in cnf.args + # ) + # ), f"The following was not CNF: {cnf}" - # check for error in edge cases - for case in edge_cases: - cnf = to_cnf(case) - # Expressions should not be decomposed at the to_cnf level! 
- self.assertEqual(len(cnf), 1) + s2 = self.allsols(cnf, vs, ivarmap=ivarmap) + assert s1 == s2, f"The equivalence check failed for translaton from {case} to {cnf}" - def allsols(self, cons, vs): - sols = [] + def allsols(self, cons, vs, ivarmap=None): + m = cp.Model(cons) + sols = set() - m = CPM_ortools(Model(cons)) - while m.solve(): - sols.append(vs.value()) - m += ~all(vs == vs.value()) + def display(): + if ivarmap: + for x_enc in ivarmap.values(): + x_enc._x._value = x_enc.decode() + sols.add(tuple(argvals(vs))) - return np.array(sols) + m.solveAll(solver="ortools", display=display, solution_limit=100) + assert len(sols) < 100, sols + return sols -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() - diff --git a/tests/test_tool_dimacs.py b/tests/test_tool_dimacs.py index 57e43f587..14d9fcb3c 100644 --- a/tests/test_tool_dimacs.py +++ b/tests/test_tool_dimacs.py @@ -7,7 +7,11 @@ from cpmpy.tools.dimacs import read_dimacs, write_dimacs from cpmpy.transformations.get_variables import get_variables_model from cpmpy.solvers.solver_interface import ExitStatus +from cpmpy.solvers.pindakaas import CPM_pindakaas + + +@pytest.mark.skipif(not CPM_pindakaas.supported(), reason="Pindakaas (required for `to_cnf`) not installed") class CNFTool(unittest.TestCase): def setUp(self) -> None: @@ -59,10 +63,13 @@ def test_write_cnf(self): m += b.implies(~c) m += a <= 0 - cnf_txt = write_dimacs(m) gt_cnf = "p cnf 3 3\n1 2 3 0\n-2 -3 0\n-1 0\n" + gt_clauses = set(gt_cnf.split("\n")[1:]) # skip the p-line - self.assertEqual(cnf_txt, gt_cnf) + cnf_txt = write_dimacs(model=m) + cnf_clauses = set(cnf_txt.split("\n")[1:]) # skip the p-line + + self.assertEqual(cnf_clauses, gt_clauses) def test_missing_p_line(self): From 1ac8f7c20b3fac24781a1442ea54aae9e2cd69de Mon Sep 17 00:00:00 2001 From: Henk Bierlee Date: Fri, 9 Jan 2026 22:17:15 +0100 Subject: [PATCH 058/152] Change `cp.sum(*iterable, **kwargs)` to `cp.sum(iterable, **kwargs)` (#756) This is in line with the 
Python's `builtins.sum` --- cpmpy/expressions/python_builtins.py | 5 ++--- tests/test_expressions.py | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpmpy/expressions/python_builtins.py b/cpmpy/expressions/python_builtins.py index e2ea536a1..82b824bdb 100644 --- a/cpmpy/expressions/python_builtins.py +++ b/cpmpy/expressions/python_builtins.py @@ -142,15 +142,14 @@ def min(*iterable, **kwargs): return Minimum(iterable) -def sum(*iterable, **kwargs): +def sum(iterable, **kwargs): """ sum() overwrites the python built-in to support decision variables. if iterable does not contain CPMpy expressions, the built-in is called checks if all constants and uses built-in sum() in that case """ - if len(iterable) == 1: - iterable = tuple(iterable[0]) # Fix generator polling + iterable = tuple(iterable) # convert iterable (possibly generator) to tuple if not builtins.any(isinstance(elem, Expression) for elem in iterable): return builtins.sum(iterable, **kwargs) diff --git a/tests/test_expressions.py b/tests/test_expressions.py index a1c2d9174..d86731568 100644 --- a/tests/test_expressions.py +++ b/tests/test_expressions.py @@ -513,7 +513,8 @@ def test_sum(self): self.assertEqual(str(gt), str(cp.sum(self.x))) self.assertEqual(str(gt), str(cp.sum(list(self.x)))) self.assertEqual(str(gt), str(cp.sum(v for v in self.x))) - self.assertEqual(str(gt), str(cp.sum(self.x[0], self.x[1], self.x[2]))) + with self.assertRaises(TypeError): # Python sum does not accept sum(1,2,3) + cp.sum(self.x[0], self.x[1], self.x[2]) def test_max(self): gt = Maximum(self.x) From e30bec7f66b87268be0a6990da10740ca0e818a3 Mon Sep 17 00:00:00 2001 From: Tias Guns Date: Mon, 12 Jan 2026 15:02:18 +0100 Subject: [PATCH 059/152] add hexaly to readme, name it a global opt solver (#767) * add hexaly to readme, name it a global opt solver * update pindakaas info * some more on hexaly * README, solvers: move pindakaas to sat, move pysdd to DD --- README.md | 8 +++++--- cpmpy/solvers/hexaly.py | 5 
++--- docs/index.rst | 17 ++++++++--------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e599f8ae6..446138016 100644 --- a/README.md +++ b/README.md @@ -40,10 +40,12 @@ Install simply with `pip install cpmpy` CPMpy can translate to a wide variety of constraint solving paradigms, including both commercial and open-source solvers. * **CP Solvers**: OR-Tools (default), IBM CP Optimizer (license required), Choco, Glasgow GCS, Pumpkin, MiniZinc+solvers -* **ILP Solvers**: Gurobi (license required) +* **ILP Solvers**: Gurobi (license required), CPLEX (license required) +* **GO Solvers**: Hexaly (license required) * **SMT Solvers**: Z3 -* **PB Solvers**: Exact, Pindakaas -* **SAT Solvers**: PySAT+solvers, PySDD +* **PB Solvers**: Exact +* **SAT Encoders and Solvers**: PySAT+solvers, Pindakaas +* **Decision Diagrams**: PySDD ### </> Example: flexible jobshop scheduling diff --git a/cpmpy/solvers/hexaly.py b/cpmpy/solvers/hexaly.py index 49d97eb77..b547ba71c 100644 --- a/cpmpy/solvers/hexaly.py +++ b/cpmpy/solvers/hexaly.py @@ -6,8 +6,7 @@ """ Interface to Hexaly's API - - Hexaly is a local search solver with support for global constraints. + Hexaly is a global optimization solver that supports nonlinear and a few global constraints. Always use :func:`cp.SolverLookup.get("hexaly") ` to instantiate the solver object. @@ -21,7 +20,7 @@ $ pip install hexaly -i https://pip.hexaly.com - The Hexaly local solver requires an active licence (for example a free academic license) + It also requires to install the Hexaly Optimizer with a Hexaly license (for example a free academic license) You can read more about available licences at https://www.hexaly.com/ See detailed installation instructions at: diff --git a/docs/index.rst b/docs/index.rst index 1cabdd9d9..076375b90 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -60,6 +60,11 @@ Supported solvers - SAT ASAT ISAT - OPT IOPT - pip - + * - :doc:`Hexaly ` + - Global Opt. 
+ - SAT ALLSAT - OPT IOPT + - pip + local + (aca.) licence + - * - :doc:`Gurobi ` - ILP - SAT - OPT IOPT - PAR @@ -76,27 +81,21 @@ Supported solvers - pip >3.10 (Linux, Win) - Manual installation on Mac possible * - :doc:`Pindakaas ` - - Pseudo-Boolean + - SAT Encoder - SAT - local install (git + pip > 3.10) - - Encodes to SAT + - * - :doc:`PySAT ` - SAT - SAT ASAT ISAT - pip - * - :doc:`PySDD ` - - SAT Counter + - Decis. Diagram - SAT ISAT ALLSAT - KC - pip - only Boolean variables (CPMpy transformation incomplete) - * - :doc:`Hexaly ` - - Local search - - SAT ALLSAT - OPT IOPT - - pip + local + (aca.) licence - - - Native capability abbreviations: * SAT: Satisfaction, ASAT: Satisfaction under Assumptions+core extraction, ISAT: Incremental Satisfaction, ALLSAT: All solution enumeration * OPT: Optimisation, IOPT: Incremental optimisation From 1186d28c87e700667b0eb8ccf8e7632f421c6041 Mon Sep 17 00:00:00 2001 From: Henk Bierlee Date: Mon, 12 Jan 2026 15:09:51 +0100 Subject: [PATCH 060/152] Fix/bug810 handle pdk unsat with conditions (#811) * Handle pdk.Unsatisfiable for add_encoding w/ conditions * Fix polarity of conditions * Refactor conditions handling * Reformat * Customize test for specific solvers * Update pindakaas.py some long lines should not be split, makes them unreadable --------- Co-authored-by: Tias Guns --- cpmpy/solvers/pindakaas.py | 56 +++++++++++++++++++++------------ cpmpy/transformations/to_cnf.py | 4 +-- tests/test_int2bool.py | 1 + tests/test_solvers.py | 11 +++++++ 4 files changed, 50 insertions(+), 22 deletions(-) diff --git a/cpmpy/solvers/pindakaas.py b/cpmpy/solvers/pindakaas.py index 2291aa3e1..0bdf84141 100755 --- a/cpmpy/solvers/pindakaas.py +++ b/cpmpy/solvers/pindakaas.py @@ -42,8 +42,8 @@ from typing import Optional, List from ..exceptions import NotSupportedError -from ..expressions.utils import eval_comparison from ..expressions.core import BoolVal, Comparison +from ..expressions.utils import eval_comparison from 
..expressions.variables import NegBoolView, _BoolVarImpl, _IntVarImpl from ..transformations.decompose_global import decompose_in_tree from ..transformations.flatten_model import flatten_constraint @@ -91,9 +91,10 @@ def supported(): @staticmethod def version() -> Optional[str]: """Return the installed version of the solver's Python API.""" - from importlib.metadata import version, PackageNotFoundError + from importlib.metadata import PackageNotFoundError, version + try: - return version('pindakaas') + return version("pindakaas") except PackageNotFoundError: return None @@ -232,15 +233,17 @@ def solver_var(self, cpm_var): enc = self.ivarmap[cpm_var.name] return self.solver_vars(enc.vars()) else: - raise TypeError + raise TypeError(f"Unexpected type: {cpm_var}") def transform(self, cpm_expr): cpm_cons = toplevel_list(cpm_expr) cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"div", "mod", "element"}) - cpm_cons = decompose_in_tree(cpm_cons, - supported=self.supported_global_constraints | {"alldifferent"}, # alldiff has a specialized MIP decomp in linearize - supported_reified=self.supported_reified_global_constraints, - csemap=self._csemap) + cpm_cons = decompose_in_tree( + cpm_cons, + supported=self.supported_global_constraints | {"alldifferent"}, # alldiff has a specialized MIP decomp in linearize + supported_reified=self.supported_reified_global_constraints, + csemap=self._csemap, + ) cpm_cons = simplify_boolean(cpm_cons) cpm_cons = flatten_constraint(cpm_cons, csemap=self._csemap) # flat normal form cpm_cons = only_bv_reifies(cpm_cons, csemap=self._csemap) @@ -271,33 +274,34 @@ def add(self, cpm_expr_orig): __add__ = add # avoid redirect in superclass - def _add_clause(self, cpm_expr): - if not isinstance(cpm_expr, list): + def _add_clause(self, clause, conditions=[]): + """Add a clause implied by conditions; both arguments are lists of CPMpy literals.""" + if not isinstance(clause, list): raise TypeError - 
self.pdk_solver.add_clause(self.solver_vars(cpm_expr)) + self.pdk_solver.add_clause(self.solver_vars([~c for c in conditions] + clause)) def _post_constraint(self, cpm_expr, conditions=[]): if not isinstance(conditions, list): raise TypeError """Add a single, *transformed* constraint, implied by conditions.""" + import pindakaas as pdk + if isinstance(cpm_expr, BoolVal): # base case: Boolean value if cpm_expr.args[0] is False: - self._add_clause(conditions) + self._add_clause([], conditions=conditions) elif isinstance(cpm_expr, _BoolVarImpl): # (implied) literal - self._add_clause(conditions + [cpm_expr]) + self._add_clause([cpm_expr], conditions=conditions) elif cpm_expr.name == "or": # (implied) clause - self._add_clause(conditions + cpm_expr.args) + self._add_clause(cpm_expr.args, conditions=conditions) elif cpm_expr.name == "->": # implication a0, a1 = cpm_expr.args - if not isinstance(a0, _BoolVarImpl): - raise TypeError - self._post_constraint(a1, conditions=conditions + [~a0]) + self._post_constraint(a1, conditions=conditions + [a0]) elif isinstance(cpm_expr, Comparison): # Bool linear assert cpm_expr.name in {"<=", ">=", "=="}, ( @@ -319,9 +323,21 @@ def _post_constraint(self, cpm_expr, conditions=[]): lhs = sum(c * l for c, l in zip(coefficients, self.solver_vars(literals))) - self.pdk_solver.add_encoding( - eval_comparison(cpm_expr.name, lhs, rhs), conditions=self.solver_vars(conditions) - ) + try: + # normalization may raise `pdk.Unsatisfiable` + self.pdk_solver.add_encoding( + eval_comparison(cpm_expr.name, lhs, rhs), + # seems pindakaas conditions are the wrong way around + conditions=self.solver_vars([~c for c in conditions]), + ) + except pdk.Unsatisfiable as e: + if conditions: + # trivial unsat with conditions does not count; posts ~conditions + # `add_clause` may raise `pdk.Unsatisfiable` too, but the conditions are added to the clause, so no need to catch + self._add_clause([], conditions=conditions) + else: + # no conditions means truly 
unsatisfiable + raise e else: raise NotSupportedError(f"{self.name}: Unsupported constraint {cpm_expr}") diff --git a/cpmpy/transformations/to_cnf.py b/cpmpy/transformations/to_cnf.py index 2dfb25842..a1c0f2f0d 100644 --- a/cpmpy/transformations/to_cnf.py +++ b/cpmpy/transformations/to_cnf.py @@ -36,9 +36,9 @@ def to_cnf(constraints, csemap=None, ivarmap=None, encoding="auto"): # the encoded constraints (i.e. `PB`s) will be added to this `pdk.CNF` object slv.pdk_solver = pdk.CNF() - # however, we bypass `pindakaas` for simple clauses + # however, we bypass `pindakaas` for simple clauses for efficiency clauses = [] - slv._add_clause = lambda cpm_expr: clauses.append(cp.any(cpm_expr)) + slv._add_clause = lambda clause, conditions=[]: clauses.append(cp.any([~c for c in conditions] + clause)) # add, transform, and encode constraints into CNF/clauses slv += constraints diff --git a/tests/test_int2bool.py b/tests/test_int2bool.py index 964bbed6e..14297f6e5 100644 --- a/tests/test_int2bool.py +++ b/tests/test_int2bool.py @@ -54,6 +54,7 @@ Comparison(cmp, Operator("wsum", [[2, 3, 5], [x, y, z]]), -10), Comparison(cmp, Operator("wsum", [[2, 3, 5], [x, y, z]]), 100), # where ub(lhs) Date: Tue, 20 Jan 2026 18:21:48 +0100 Subject: [PATCH 061/152] miplib dataset --- cpmpy/tools/dataset/model/miplib.py | 110 ++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 cpmpy/tools/dataset/model/miplib.py diff --git a/cpmpy/tools/dataset/model/miplib.py b/cpmpy/tools/dataset/model/miplib.py new file mode 100644 index 000000000..c5a169cf4 --- /dev/null +++ b/cpmpy/tools/dataset/model/miplib.py @@ -0,0 +1,110 @@ +""" +MIPLib Dataset + +https://maxsat-evaluations.github.io/ +""" + + +import os +import gzip +import zipfile +import pathlib + +from cpmpy.tools.dataset._base import _Dataset + + +class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible + + + def __init__( + self, + root: str = ".", + year: int = 2024, track: str = "exact-unweighted", + 
transform=None, target_transform=None, + download: bool = False + ): + """ + Constructor for a dataset object of the MSE competition. + + Arguments: + root (str): Root directory where datasets are stored or will be downloaded to (default="."). + year (int): Competition year of the dataset to use (default=2024). + track (str): Track name specifying which subset of the competition instances to load (default="exact-unweighted"). + transform (callable, optional): Optional transform applied to the instance file path. + target_transform (callable, optional): Optional transform applied to the metadata dictionary. + download (bool): If True, downloads the dataset if it does not exist locally (default=False). + + + Raises: + ValueError: If the dataset directory does not exist and `download=False`, + or if the requested year/track combination is not available. + """ + + self.root = pathlib.Path(root) + self.year = year + self.track = track + + # # Check requested dataset + # if not str(year).startswith('20'): + # raise ValueError("Year must start with '20'") + # if not track: + # raise ValueError("Track must be specified, e.g. OPT-LIN, DEC-LIN, ...") + + dataset_dir = self.root / "miplib" + + super().__init__( + dataset_dir=dataset_dir, + transform=transform, target_transform=target_transform, + download=download, extension=".mps.gz" + ) + + + def category(self) -> dict: + return { + "year": self.year, + "track": self.track + } + + + def download(self): + print("Downloading MIPLib instances...") + + zip_name = "collection.zip" + url = "https://miplib.zib.de/downloads/" + + dataset_dir = self.root / "miplib" + + if dataset_dir.exists(): + print(f"Using existing dataset directory: {dataset_dir}") + else: + print(f"Downloading {zip_name}...") + try: + cached_filepath = super().download_file(url, target=zip_name, desc=zip_name) + except ValueError as e: + raise ValueError(f"No dataset available. 
Error: {str(e)}") + + # Extract only the specific track folder from the tar + with zipfile.ZipFile(cached_filepath, 'r') as zip_ref: + # Create track folder in root directory, parents=True ensures recursive creation + self.dataset_dir.mkdir(parents=True, exist_ok=True) + + # Extract files + for file_info in zip_ref.infolist(): + # Extract file to family_dir, removing main_folder/track prefix + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: + target.write(source.read()) + # Do not cleanup cached file, as it is in the global cache directory + # zip_path.unlink() + + def open(self, instance: os.PathLike) -> callable: + return gzip.open(instance, "rt") if str(instance).endswith(".gz") else open(instance) + +if __name__ == "__main__": + dataset = MIPLibDataset(download=True) + print("Dataset size:", len(dataset)) + print("Instance 0:", dataset[0]) + + from cpmpy.tools.mps import read_mps + model = read_mps(dataset[0][0], open=dataset.open) + print(model) From 54031bdcef26e8919160f0c676e3727c6e60b734 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 20 Jan 2026 18:22:05 +0100 Subject: [PATCH 062/152] start of reader and writer --- cpmpy/tools/mps/__init__.py | 21 ++ cpmpy/tools/mps/parser.py | 724 ++++++++++++++++++++++++++++++++++++ 2 files changed, 745 insertions(+) create mode 100644 cpmpy/tools/mps/__init__.py create mode 100644 cpmpy/tools/mps/parser.py diff --git a/cpmpy/tools/mps/__init__.py b/cpmpy/tools/mps/__init__.py new file mode 100644 index 000000000..540863b84 --- /dev/null +++ b/cpmpy/tools/mps/__init__.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## __init__.py +## +""" +Set of utilities for working with MPS-formatted LP/MIP models. + + +================== +List of submodules +================== + +.. 
autosummary:: + :nosignatures: + + parser +""" + +from .parser import read_mps +from .parser import write_mps \ No newline at end of file diff --git a/cpmpy/tools/mps/parser.py b/cpmpy/tools/mps/parser.py new file mode 100644 index 000000000..3fb2cd459 --- /dev/null +++ b/cpmpy/tools/mps/parser.py @@ -0,0 +1,724 @@ +""" +Parser for the MPS format. + + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read_mps + write_mps + +======================== +List of helper functions +======================== + +.. autosummary:: + :nosignatures: + + _parse_mps + _load_mps +""" + +from __future__ import annotations + +import os +import cpmpy as cp +import numpy as np +from io import StringIO +from typing import Any, List, Optional, TextIO, Tuple, Union +from enum import Enum + +from cpmpy.transformations.comparison import only_numexpr_equality +from cpmpy.transformations.decompose_global import decompose_in_tree +from cpmpy.transformations.flatten_model import flatten_constraint +from cpmpy.transformations.get_variables import get_variables +from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv +from cpmpy.transformations.normalize import toplevel_list +from cpmpy.transformations.reification import only_implies, reify_rewrite, only_bv_reifies +from cpmpy.transformations.safening import no_partial_functions + + +class ConstraintType(Enum): + EQUAL = "E" # == + GREATER_THAN = "G" # > + LESS_THAN = "L" # < + NON_CONSTRAINING = "N" # objective + +class VariableType(Enum): + INTEGER = "I" + CONTINUOUS = "C" # not supported + FLOATING_POINT = "F" # not supported + BINARY = "B" + FREE = "F" # not supported + CONSTANT = "X" # only integers (for now float constants not supported, even in objective function) + +def _get_constraint_type(constraint_type: str) -> ConstraintType: + """ + Gets the constraint type from a string. + + Arguments: + constraint_type (str): The constraint type string. 
+ + Returns: + ConstraintType: The constraint type. + """ + if constraint_type == "E": + return ConstraintType.EQUAL + elif constraint_type == "G": + return ConstraintType.GREATER_THAN + elif constraint_type == "L": + return ConstraintType.LESS_THAN + elif constraint_type == "N": + return ConstraintType.NON_CONSTRAINING + else: + raise ValueError(f"Invalid constraint type: {constraint_type}") + +class MPS: + + _metadata = dict() # metadata on the MPS instance + _row_map = dict() # maps constraint names to types of constraint (ConstraintType) + objective = None # name of the expression which represents the objective + minimize = True # direction of optimisation + _A_matrix = {} # A matrix (variable x constraint) + _rhs_map = dict() # right hand side of the expressions, maps expression name to its rhs + _lb_map = dict() # lower bounds of the variables, maps variable name to its lb + _ub_map = dict() # upper bounds of the variables, maps variable name to its ub + _type_map = dict() # for each variable name, stores the type of variable it represents (VariableType) + _intorg = False # state management for the INTORG marker (in COLUMNS section) + + + def __init__(self, assume_interger_variables:bool=True): + """ + Initializes the MPS object. + + Arguments: + assume_interger_variables (bool): Whether to assume integer variables. Default is True. + If True, floating point variables will be converted to integer variables. + If False, floating point variables will be kept as floating point variables + and an exception will be raised (cpmpy does not support floating point decision variables) + """ + self.ASSUME_INTEGER_VARIABLES = assume_interger_variables + + @property + def metadata(self) -> dict: + """ + Returns the metadata of the MPS instance. + + Returns: + dict: The metadata of the MPS instance. 
+ """ + return self._metadata + + def _get_bounds(self, variable_name:str) -> Tuple: + lb = self._lb_map.get(variable_name, 0) + if variable_name not in self._ub_map: + raise ValueError(f"Upper bound not found for variable: {variable_name}. CPMpy does not support unbounded variables.") + ub = self._ub_map[variable_name] + return lb, ub + + + def to_cpmpy(self, model_constants:bool=False, filter_zeros:bool=True) -> cp.Model: + """ + Converts the MPS instance to a CPMpy model. + + Returns: + cp.Model: The CPMpy model. + """ + + _var_map = dict() + + def _get_variable(variable_name: str): + if variable_name not in _var_map: + + type = self._type_map.get(variable_name, VariableType.FREE) + if type == VariableType.INTEGER: + _var_map[variable_name] = cp.intvar(name=variable_name, lb=self._get_bounds(variable_name)[0], ub=self._get_bounds(variable_name)[1]) + elif type == VariableType.FLOATING_POINT: + if self.ASSUME_INTEGER_VARIABLES: + _var_map[variable_name] = cp.intvar(name=variable_name, lb=int(self._get_bounds(variable_name)[0]), ub=int(self._get_bounds(variable_name)[1])) + else: + raise ValueError(f"Floating point variables are not supported: {variable_name}") + elif type == VariableType.BINARY: + _var_map[variable_name] = cp.boolvar(name=variable_name) + elif type == VariableType.CONSTANT: + if model_constants: + _var_map[variable_name] = cp.intvar(name=variable_name, lb=self._get_bounds(variable_name)[0], ub=self._get_bounds(variable_name)[0]) + else: + _var_map[variable_name] = self._get_bounds(variable_name)[0] + else: + raise ValueError(f"Invalid variable type: {type} for variable: {variable_name}") + + return _var_map[variable_name] + + def _get_variables(variable_names: list[str]): + return np.array([_get_variable(variable_name) for variable_name in variable_names]) + + model = cp.Model() + + inverted_A_matrix = self.invert_A_matrix() + + for constraint_name, constraint_type in self._row_map.items(): + print(constraint_name, constraint_type) + if 
constraint_type == ConstraintType.NON_CONSTRAINING: + obj_array = np.array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) + if filter_zeros: + obj_array = [o for o in obj_array if not (isinstance(o, (int, np.integer)) and o == 0)] + objective = cp.sum(obj_array) + if self.minimize: + model.minimize(objective) + else: + model.maximize(objective) + + else: + if constraint_type == ConstraintType.EQUAL: + lhs = np.array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) + if filter_zeros: + lhs = [l for l in lhs if not (isinstance(l, (int, np.integer)) and l == 0)] + model += cp.sum(lhs) == self._rhs_map[constraint_name] + elif constraint_type == ConstraintType.GREATER_THAN: + lhs = np.array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) + if filter_zeros: + lhs = [l for l in lhs if not (isinstance(l, (int, np.integer)) and l == 0)] + model += cp.sum(lhs) >= self._rhs_map[constraint_name] + elif constraint_type == ConstraintType.LESS_THAN: + lhs = cp.cpm_array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) + if filter_zeros: + lhs = [l for l in lhs if not (isinstance(l, int) and l.value != 0)] + model += cp.sum(list(lhs)) <= self._rhs_map[constraint_name] + else: + raise ValueError(f"Invalid constraint type: {constraint_type} for constraint: {constraint_name}") + + return model + + @classmethod + def _transform(cls, cpm_cons: list[cp.Expression], csemap: dict) -> list[cp.Expression]: + """ + Transforms a list of CPMpy expressions to a list of linearised expressions, compatible with the MPS format. + + Arguments: + cpm_cons (list[cp.Expression]): The list of CPMpy expressions to transform. + csemap (dict): The context-sensitive evaluation map. 
+ """ + # TODO: for now just straight copy from CPM_gurobi + cpm_cons = toplevel_list(cpm_cons) + cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"mod", "div"}) # linearize expects safe exprs + supported = {"min", "max", "abs", "alldifferent"} # alldiff has a specialized MIP decomp in linearize + cpm_cons = decompose_in_tree(cpm_cons, supported, csemap=csemap) + cpm_cons = flatten_constraint(cpm_cons, csemap=csemap) # flat normal form + cpm_cons = reify_rewrite(cpm_cons, supported=frozenset(['sum', 'wsum']), csemap=csemap) # constraints that support reification + cpm_cons = only_numexpr_equality(cpm_cons, supported=frozenset(["sum", "wsum", "sub"]), csemap=csemap) # supports >, <, != + cpm_cons = only_bv_reifies(cpm_cons, csemap=csemap) + cpm_cons = only_implies(cpm_cons, csemap=csemap) # anything that can create full reif should go above... + print(cpm_cons) + # gurobi does not round towards zero, so no 'div' in supported set: https://github.com/CPMpy/cpmpy/pull/593#issuecomment-2786707188 + cpm_cons = linearize_constraint(cpm_cons, supported=frozenset({"sum", "wsum", "sub"}), csemap=csemap) # the core of the MIP-linearization + print(cpm_cons) + cpm_cons = only_positive_bv(cpm_cons, csemap=csemap) # after linearization, rewrite ~bv into 1-bv + return cpm_cons + + @classmethod + def from_cpmpy(cls, model: cp.Model) -> MPS: + """· + Converts a CPMpy model to an MPS object. + + Arguments: + model (cp.Model): The CPMpy model to convert. 
+ """ + cpm_expr = model.constraints + for c in cpm_expr: + print(c) + csemap = dict() + cpm_cons = cls._transform(cpm_expr, csemap=csemap) + for c in cpm_cons: + print(c) + + mps_obj = MPS() + + # -------------------------------- Constraints ------------------------------- # + + for i, cpm_con in enumerate(cpm_cons): + if isinstance(cpm_con, cp.expressions.core.Comparison): + # Comparison type + if cpm_con.name == "==": + mps_obj.set_constraint_type(f'c{i}', ConstraintType.EQUAL) + elif cpm_con.name == ">=": + mps_obj.set_constraint_type(f'c{i}', ConstraintType.GREATER_THAN) + elif cpm_con.name == "<=": + mps_obj.set_constraint_type(f'c{i}', ConstraintType.LESS_THAN) + else: + raise ValueError(f"Invalid comparison operator: {cpm_con.name}") + + # LHS + if cpm_con.args[0].name == "wsum": + weights, variables = cpm_con.args + for weight, variable in zip(weights, variables): + mps_obj.update_column(f'c{i}', variable.name, weight) + elif cpm_con.args[0].name == "sum": + variables_with_weights = cpm_con.args + weights, variables = zip[tuple[Any, ...]](*[(a.args[0], a.args[1]) if isinstance(a, cp.Operator) and a.name == "mul" and len(a.args) == 2 else (1, a) for a in variables_with_weights]) + for weight, variable in zip(weights, variables): + mps_obj.update_column(f'c{i}', variable.name, weight) + else: + raise ValueError(f"Invalid constraint type: {type(cpm_con.args[0])}") + + # RHS + mps_obj.update_rhs(f'c{i}', cpm_con.args[1]) + + else: + raise ValueError(f"Invalid constraint type: {type(cpm_con)}") + + # --------------------------------- Variables -------------------------------- # + + variables = get_variables(cpm_cons) + for variable in variables: + lb, up = variable.get_bounds() + mps_obj.update_bounds(variable.name, "LI", lb) + mps_obj.update_bounds(variable.name, "UI", up) + + # --------------------------------- Objective -------------------------------- # + + objective = cls._transform(model.objective, csemap=csemap) + objective_name = 'min' if model.minimize 
else 'max' + 'obj' + mps_obj.minimize = model.minimize + mps_obj.set_constraint_type(objective_name, ConstraintType.NON_CONSTRAINING) + if objective.name == "wsum": + weights, variables = objective.args + for weight, variable in zip(weights, variables): + mps_obj.update_column(objective_name, variable.name, weight) + elif objective.name == "sum": + variables_with_weights = objective.args + weights, variables = zip[tuple[Any, ...]](*[(a.args[0], a.args[1]) if isinstance(a, cp.Operator) and a.name == "mul" and len(a.args) == 2 else (1, a) for a in variables_with_weights]) + for weight, variable in zip(weights, variables): + mps_obj.update_column(objective_name, variable.name, weight) + else: + raise ValueError(f"Invalid constraint type: {type(objective)}") + + # ------------------------------------- - ------------------------------------ # + + return mps_obj + + @classmethod + def _format_space(cls, string:str, space:Optional[int]=None, leading:int=0) -> str: + if space is None: + space=len(string) + if len(string) < space: + return f"{'':<{leading}}{string:<{space}}" + else: + raise ValueError(f"String {string} is longer than {space} characters") + + @classmethod + def _format_line(cls, strings, spaces, format:str, leading:int=0) -> str: + if format == "fixed": + line = cls._format_string(strings[0], spaces[0], leading=leading) + if len(strings) > 1: + line +=''.join([cls._format_string(string,space) for (string,space) in zip(strings[1:], spaces[1:])]) + return line + elif format == "free": + return cls._format_space('', leading) + ' '.join(val for pair in zip(strings, spaces) for val in pair) + else: + raise ValueError(f"Invalid format: {format}") + + @classmethod + def _write_name(cls, name, format:str) -> str: + return cls._format_line(('Name',), (14,), format=format) + + @classmethod + def _write_objective(cls, minimize:bool, format:str) -> str: + return cls._format_line(('N', f"{'min' if minimize else 'max'}obj"), (4, None), leading=1) + + @classmethod + def 
_write_row(cls, row_name:str, constraint_type: ConstraintType, format:str) -> str: + return cls._format_line((constraint_type.value, row_name), (4, None), leading=1, format=format) + + @classmethod + def _write_opening_marker(cls, format:str): + return cls._format_line(('MARK0000', 'MARKER', 'INTORG'), (10, 20, None), leading=4, format=format) + + @classmethod + def _write_closing_marker(cls, format): + return cls._format_line(('MARK0001', 'MARKER', 'INTEND'), (10, 20, None), leading=4, format=format) + + @classmethod + def _write_column(cls, column_name:str, variables_with_coefficients:List[Tuple[str, int]], format:str) -> str: + for a,b in zip(variables_with_coefficients[::2],variables_with_coefficients[1::2]): + yield cls._format_line((column_name, a[0], a[1], b[0], b[1]), (10, 10, 5, 10, 5), leading=4, format=format) + if len(variables_with_coefficients) % 2 != 0: + yield cls._format_line((column_name, variables_with_coefficients[-1][0], variables_with_coefficients[-1][1]), (10, 10, 5), leading=4, format=format) + + @classmethod + def _write_rhs(cls, variables_with_coefficients:List[Tuple[str, int]], format:str) -> str: + for a,b in zip(variables_with_coefficients[::2],variables_with_coefficients[1::2]): + yield cls._format_line(('rhs', a[0], a[1], b[0], b[1]), (10, 21, 5, 21, 5), leading=4, format=format) + if len(variables_with_coefficients) % 2 != 0: + yield cls._format_line(('rhs', variables_with_coefficients[-1][0], variables_with_coefficients[-1][1]), (10, 21, 5), leading=4, format=format) + + + def write_mps(self, file_path: Optional[str] = None, format: str = "fixed"): + mps_string = [] + + + if format == "fixed": + # Name + mps_string.append(self._write_name(self._metadata['name'], format=format)) + # Rows + mps_string.append("ROWS") + mps_string.append(self._write_objective(self.minimize, format=format)) + + for row_name, constraint_type in self._row_map.keys(): + mps_string.append(self._write_row(row_name, constraint_type)) + # Columns + 
mps_string.append("COLUMNS") + mps_string.append(self._write_opening_marker()) + for column_name, column_rows in self._A_matrix.items(): + for line in self._write_column(column_name, zip(column_rows.keys(), column_rows.values()), format=format): + mps_string.append(line) + mps_string.append(self._write_closing_marker()) + # RHS + mps_string.append("RHS") + for line in self._write_rhs(zip(self._rhs_map.keys(), self._rhs_map.values()), format=format): + mps_string.append(line) + # Bounds + mps_string.append("BOUNDS") + for row_name in self._row_map.keys(): + variable_type = self._type_map[row_name] + if variable_type == VariableType.FLOATING_POINT: + if row_name in self._lb_map: + mps_string.append(self._format_line(('LO', 'bnd', row_name, self._lb_map.get(row_name, 0)), (3, 10, 21, None), leading=1, format=format)) + if row_name in self._ub_map: + mps_string.append(self._format_line(('UP', 'bnd', row_name, self._ub_map[row_name]), (3, 10, 21, None), leading=1, format=format)) + elif variable_type == VariableType.INTEGER: + if row_name in self._lb_map: + mps_string.append(self._format_line(('LI', 'bnd', row_name, self._lb_map.get(row_name, 0)), (3, 10, 21, None), leading=1, format=format)) + if row_name in self._ub_map: + mps_string.append(self._format_line(('UI', 'bnd', row_name, self._ub_map[row_name]), (3, 10, 21, None), leading=1, format=format)) + elif variable_type == VariableType.BINARY: + mps_string.append(self._format_line(('BV', 'bnd', row_name, self._lb_map.get(row_name, 0)), (3, 10, 21, None), leading=1, format=format)) + elif variable_type == VariableType.CONSTANT: + mps_string.append(self._format_line(('FX', 'bnd', row_name, self._lb_map[row_name]), (3, 10, 21, None), leading=1, format=format)) + else: + raise ValueError(f"Invalid variable type: {variable_type} for variable: {row_name}") + # End + mps_string.append("ENDATA") + + mps_string = "\n".join(mps_string) + + if file_path is not None: + with open(file_path, "w") as f: + f.write(mps_string) + + 
return mps_string + + def set_objective(self, expression_name: str): + """ + Sets the name of the expression that represents the objective. + + Arguments: + expression_name (str): The name of the expression that represents the objective. + """ + self.objective = expression_name + + def set_constraint_type(self, constraint_name: str, constraint_type: ConstraintType): + """ + Sets the type of a constraint. + + Arguments: + constraint_name (str): The name of the constraint. + constraint_type (ConstraintType): The type of the constraint. + """ + self._row_map[constraint_name] = constraint_type + + def set_marker(self, marker: str): + """ + Sets the marker for the INTORG/INTEND section. + + Arguments: + marker (str): The marker to set. + """ + if "'INTORG'" == marker: + self._intorg = True + elif "'INTEND'" == marker: + self._intorg = False + + def update_column(self, column_name: str, row_name: str, row_coeff: str): + """ + Updates the A matrix. + + Arguments: + column_name (str): The name of the column. + row_name (str): The name of the row. + row_coeff (str): The coefficient of the row. + """ + if self._intorg: + row_coeff = int(row_coeff) + else: + if self.ASSUME_INTEGER_VARIABLES: + row_coeff = int(row_coeff) + else: + raise ValueError(f"Floating point variables are not supported: {row_coeff}") + self._A_matrix[column_name] = self._A_matrix.get(column_name, {}) | {row_name: row_coeff} + + def update_rhs(self, row_name: str, row_coeff: str): + """ + Updates the right hand side of a constraint. + + Arguments: + row_name (str): The name of the constraint. + row_coeff (str): The right hand side of the constraint. 
+ """ + if self._intorg: + row_coeff = int(row_coeff) + else: + if self.ASSUME_INTEGER_VARIABLES: + row_coeff = int(row_coeff) + elif row_coeff != int(row_coeff): + raise ValueError(f"Floating point variables are not supported: {row_coeff}") + else: + row_coeff = int(row_coeff) + self._rhs_map[row_name] = row_coeff + + def update_bounds(self, row_name: str, type: str, bound_value: str): + """ + Updates the bounds of a variable. + + Arguments: + row_name (str): The name of the variable. + type (str): The type of the bound. + bound_value (str): The value of the bound. + """ + if type == "LO": + self._type_map[row_name] = VariableType.FLOATING_POINT + if self.ASSUME_INTEGER_VARIABLES: + self._lb_map[row_name] = int(bound_value) + else: + if bound_value != int(bound_value): + raise ValueError(f"Floating point bounds are not supported: {bound_value}") + self._lb_map[row_name] = int(bound_value) + elif type == "UP": + self._type_map[row_name] = VariableType.FLOATING_POINT + if self.ASSUME_INTEGER_VARIABLES: + self._ub_map[row_name] = int(bound_value) + else: + if bound_value != int(bound_value): + raise ValueError(f"Floating point bounds are not supported: {bound_value}") + self._ub_map[row_name] = int(bound_value) + elif type == "FX": + self._type_map[row_name] = VariableType.CONSTANT + if bound_value != int(bound_value): + if self.ASSUME_INTEGER_VARIABLES: + bound_value = int(bound_value) + else: + raise ValueError(f"Floating point bounds are not supported: {bound_value}") + self._lb_map[row_name] = int(bound_value) + self._ub_map[row_name] = int(bound_value) + elif type == "BV": + self._type_map[row_name] = VariableType.BINARY + self._lb_map[row_name] = 0 + self._ub_map[row_name] = 1 + elif type == "LI": + self._type_map[row_name] = VariableType.INTEGER + self._lb_map[row_name] = int(bound_value) + elif type == "UI": + self._type_map[row_name] = VariableType.INTEGER + self._ub_map[row_name] = int(bound_value) + elif type == "SC": + pass + elif type == "SI": + pass + 
elif type == "FR": + pass + elif type == "MI": + pass + elif type == "PL": + pass + else: + raise ValueError(f"Invalid bound type: {type}") + + def invert_A_matrix(self): + """ + Inverts the A matrix, becoming a (constraint x variable) matrix. + + Returns: + dict: The inverted A matrix. + """ + inverted_A_matrix = dict() + for column_name, column_rows in self._A_matrix.items(): + for row_name, row_coeff in column_rows.items(): + inverted_A_matrix[row_name] = inverted_A_matrix.get(row_name, {}) | {column_name: row_coeff} + return inverted_A_matrix + + @classmethod + def _read_line(cls, line, starts:List[int], format:str, required:Optional[List[bool]]=None) -> List: + if required is not None: + for i, (s, r) in enumerate(zip(starts, required)): + if s >= len(line): + if r: + raise ValueError(f"Required field {i} is missing") + else: + i -= 1 + break + starts = starts[:i+1] + if format == "fixed": + res = [] + for a,b in zip(starts[:], starts[1:]): + res.append(line[a:b].strip()) + res.append(line[starts[-1]:].strip()) + if required is not None: + res += [None]*(len(required)-len(starts)) + return res + elif format == "free": + return line.split() + [None]*(len(required)-len(starts)) if required is not None else line.split() + else: + raise ValueError(f"Invalid format: {format}") + + +def _parse_mps(f: TextIO, format: str = "fixed", **kwargs) -> MPS: + """ + Parses an MPS string and returns an MPS object. + + Arguments: + mps (str): The MPS string to parse. 
+ """ + + mps_obj = MPS() + + lines = f.readlines() + i = 0 + + while i < len(lines): + line = lines[i] + print(line) + + if line.startswith("NAME"): + mps_obj._metadata["name"] = mps_obj._read_line(line, (15,), format=format) + i += 1 + line = lines[i] + elif line.startswith("OBJSENSE"): # optional, not part of core specification + direction = mps_obj._read_line(line, (9,), format=format) + if direction == "MIN": + pass # default is minimize + elif direction == "MAX": + mps_obj.minimize = False + else: + raise ValueError(f"Invalid optimisation direction: {direction}") + i += 1 + elif line.startswith("*"): # comment line + i += 1 + elif line.startswith("ROWS"): # name of constraints + i += 1 + line = lines[i] + while i < len(lines) and (line[0] == " " or line[0] == "*"): + # create mapping of constraint name to constraint type + constraint_type, constraint_name = mps_obj._read_line(line, (1, 4), format=format) + constraint_type = _get_constraint_type(constraint_type.lstrip()) # operators can be in column 2 or 3 + print(constraint_name) + if constraint_type == ConstraintType.NON_CONSTRAINING: + mps_obj.set_objective(constraint_name) + mps_obj.set_constraint_type(constraint_name, constraint_type) + i += 1 + line = lines[i] + elif line.startswith("COLUMNS"): + i += 1 + line = lines[i] + while i < len(lines) and (line[0] == " " or line[0] == "*"): + if len(line) >= 32 and line[14:22] == "'MARKER'": + mps_obj.set_marker(line[24:34]) + else: + column_name, row_name, row_coeff, row2_name, row2_coeff = mps_obj._read_line(line, (4, 14, 35, 39, 60), required=(True, True, True, False, False), format=format) + mps_obj.update_column(column_name, row_name, row_coeff) + if row2_name is not None: + mps_obj.update_column(column_name, row2_name, row2_coeff) + i += 1 + line = lines[i] + elif line.startswith("RHS"): + i += 1 + line = lines[i] + while i < len(lines) and (line[0] == " " or line[0] == "*"): + + row_name, row_coeff, row2_name, row2_coeff = mps_obj._read_line(line, (14, 
24, 39, 49), required=(True, True, False, False), format=format) + mps_obj.update_rhs(row_name, row_coeff) + if row2_name is not None: + mps_obj.update_rhs(row2_name, row2_coeff) + i += 1 + line = lines[i] + elif line.startswith("BOUNDS"): + i += 1 + line = lines[i] + while i < len(lines) and (line[0] == " " or line[0] == "*"): + type, _, row_name, bound_value = mps_obj._read_line(line, (1, 3, 14, 35), required=(True, True, True, False), format=format) + if bound_value is None: + bound_value = 0 + print(line) + print(row_name, type, bound_value) + mps_obj.update_bounds(row_name, type, bound_value) + i += 1 + line = lines[i] + elif line.startswith("ENDATA"): + break + else: + raise ValueError(f"Invalid line: {line}") + i += 1 + + return mps_obj + +def _load_mps(mps_obj: MPS, **kwargs) -> cp.Model: + """ + Loads an MPS object into a CPMpy model. + + Arguments: + mps_obj (MPS): The MPS object to load. + """ + return mps_obj.to_cpmpy(**kwargs) + + +_std_open = open +def read_mps(mps: Union[str, os.PathLike], open=open, format:str="fixed", **kwargs) -> cp.Model: + """ + Parser for MPS format. Reads in an instance and returns its matching CPMpy model. + + Arguments: + mps (str or os.PathLike): + - A file path to a MPS file + - OR a string containing the MPS content directly + open: (callable): + If mps is the path to a file, a callable to "open" that file (default=python standard library's 'open'). + format: (str): + The format of the MPS file. Can be "fixed" or "free". Default is "fixed". 
+ """ + + # If mps is a path to a file -> open file + if isinstance(mps, (str, os.PathLike)) and os.path.exists(mps): + if open is not None: + f = open(mps) + else: + f = _std_open(mps, "rt") + # If mps is a string containing a model -> create a memory-mapped file + else: + f = StringIO(mps) + + + mps_obj = _parse_mps(f, format=format, **kwargs) + model = _load_mps(mps_obj, **kwargs) + return model + + +def write_mps(model: cp.Model, file_path: Optional[str] = None, format: str = "fixed") -> str: + """ + Writes a CPMpy model to an MPS string / file. + + Arguments: + model (cp.Model): The CPMpy model to write. + file_path (Optional[str]): Optional path to the MPS file to write. + + Returns: + str: The MPS string. + """ + mps_obj = MPS.from_cpmpy(model) + mps_string = mps_obj.write_mps(file_path, format=format) + return mps_string + + + From 3c5e1bc555ec3ad74911905a6d2a4cb1112a970b Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 22 Jan 2026 14:26:39 +0100 Subject: [PATCH 063/152] move datastructures from class to instance level --- cpmpy/tools/mps/parser.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/cpmpy/tools/mps/parser.py b/cpmpy/tools/mps/parser.py index 3fb2cd459..b31ed9a40 100644 --- a/cpmpy/tools/mps/parser.py +++ b/cpmpy/tools/mps/parser.py @@ -79,18 +79,6 @@ def _get_constraint_type(constraint_type: str) -> ConstraintType: class MPS: - _metadata = dict() # metadata on the MPS instance - _row_map = dict() # maps constraint names to types of constraint (ConstraintType) - objective = None # name of the expression which represents the objective - minimize = True # direction of optimisation - _A_matrix = {} # A matrix (variable x constraint) - _rhs_map = dict() # right hand side of the expressions, maps expression name to its rhs - _lb_map = dict() # lower bounds of the variables, maps variable name to its lb - _ub_map = dict() # upper bounds of the variables, maps variable name to its ub - _type_map = dict() # 
for each variable name, stores the type of variable it represents (VariableType) - _intorg = False # state management for the INTORG marker (in COLUMNS section) - - def __init__(self, assume_interger_variables:bool=True): """ Initializes the MPS object. @@ -102,6 +90,16 @@ def __init__(self, assume_interger_variables:bool=True): and an exception will be raised (cpmpy does not support floating point decision variables) """ self.ASSUME_INTEGER_VARIABLES = assume_interger_variables + self._metadata = dict() # metadata on the MPS instance + self._row_map = dict() # maps constraint names to types of constraint (ConstraintType) + self.objective = None # name of the expression which represents the objective + self.minimize = True # direction of optimisation + self._A_matrix = {} # A matrix (variable x constraint) + self._rhs_map = dict() # right hand side of the expressions, maps expression name to its rhs + self._lb_map = dict() # lower bounds of the variables, maps variable name to its lb + self._ub_map = dict() # upper bounds of the variables, maps variable name to its ub + self._type_map = dict() # for each variable name, stores the type of variable it represents (VariableType) + self._intorg = False # state management for the INTORG marker (in COLUMNS section) @property def metadata(self) -> dict: From b353a954cab699b4603b6d8720dbc3b9200ea95f Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 22 Jan 2026 14:27:22 +0100 Subject: [PATCH 064/152] Fix parser and add objective transformation --- cpmpy/tools/mps/parser.py | 236 ++++++++++++++++++++++---------------- 1 file changed, 137 insertions(+), 99 deletions(-) diff --git a/cpmpy/tools/mps/parser.py b/cpmpy/tools/mps/parser.py index b31ed9a40..0983ee139 100644 --- a/cpmpy/tools/mps/parser.py +++ b/cpmpy/tools/mps/parser.py @@ -33,13 +33,13 @@ from enum import Enum from cpmpy.transformations.comparison import only_numexpr_equality -from cpmpy.transformations.decompose_global import decompose_in_tree -from 
cpmpy.transformations.flatten_model import flatten_constraint +from cpmpy.transformations.decompose_global import decompose_in_tree, decompose_objective +from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective from cpmpy.transformations.get_variables import get_variables -from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv +from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv, only_positive_bv_wsum from cpmpy.transformations.normalize import toplevel_list from cpmpy.transformations.reification import only_implies, reify_rewrite, only_bv_reifies -from cpmpy.transformations.safening import no_partial_functions +from cpmpy.transformations.safening import no_partial_functions, safen_objective class ConstraintType(Enum): @@ -162,6 +162,7 @@ def _get_variables(variable_names: list[str]): for constraint_name, constraint_type in self._row_map.items(): print(constraint_name, constraint_type) if constraint_type == ConstraintType.NON_CONSTRAINING: + print(inverted_A_matrix) obj_array = np.array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) if filter_zeros: obj_array = [o for o in obj_array if not (isinstance(o, (int, np.integer)) and o == 0)] @@ -192,8 +193,8 @@ def _get_variables(variable_names: list[str]): return model - @classmethod - def _transform(cls, cpm_cons: list[cp.Expression], csemap: dict) -> list[cp.Expression]: + @staticmethod + def _transform(cpm_cons: list[cp.Expression], csemap: dict) -> list[cp.Expression]: """ Transforms a list of CPMpy expressions to a list of linearised expressions, compatible with the MPS format. @@ -213,13 +214,29 @@ def _transform(cls, cpm_cons: list[cp.Expression], csemap: dict) -> list[cp.Expr cpm_cons = only_implies(cpm_cons, csemap=csemap) # anything that can create full reif should go above... 
print(cpm_cons) # gurobi does not round towards zero, so no 'div' in supported set: https://github.com/CPMpy/cpmpy/pull/593#issuecomment-2786707188 - cpm_cons = linearize_constraint(cpm_cons, supported=frozenset({"sum", "wsum", "sub"}), csemap=csemap) # the core of the MIP-linearization + cpm_cons = linearize_constraint(cpm_cons, supported=frozenset({"sum", "wsum", "sub"}), csemap=csemap, prune_trivial=False) # the core of the MIP-linearization print(cpm_cons) cpm_cons = only_positive_bv(cpm_cons, csemap=csemap) # after linearization, rewrite ~bv into 1-bv return cpm_cons - @classmethod - def from_cpmpy(cls, model: cp.Model) -> MPS: + @staticmethod + def _transform_objective(cpm_obj: cp.Expression, csemap: dict) -> cp.Expression: + """ + Transforms a CPMpy expression to a linearised expression, compatible with the MPS format. + """ + + # transform objective + obj, safe_cons = safen_objective(cpm_obj) + obj, decomp_cons = decompose_objective(obj, csemap=csemap) + obj, flat_cons = flatten_objective(obj, csemap=csemap) + obj = only_positive_bv_wsum(obj) # remove negboolviews + + additional_constraints = (safe_cons + decomp_cons + flat_cons) + return obj, additional_constraints + + + @staticmethod + def from_cpmpy(model: cp.Model) -> MPS: """· Converts a CPMpy model to an MPS object. 
@@ -230,15 +247,35 @@ def from_cpmpy(cls, model: cp.Model) -> MPS: for c in cpm_expr: print(c) csemap = dict() - cpm_cons = cls._transform(cpm_expr, csemap=csemap) + cpm_cons = MPS._transform(cpm_expr, csemap=csemap) for c in cpm_cons: print(c) mps_obj = MPS() + + + # --------------------------------- Objective -------------------------------- # + + objective, additional_constraints = MPS._transform_objective(model.objective_, csemap=csemap) + + objective_name = ('min' if model.minimize else 'max') + 'obj' + mps_obj.minimize = model.minimize + mps_obj.set_constraint_type(objective_name, ConstraintType.NON_CONSTRAINING) + if objective.name == "wsum": + weights, variables = objective.args + for weight, variable in zip(weights, variables): + mps_obj.update_column(objective_name, variable.name, weight) + elif objective.name == "sum": + variables_with_weights = objective.args + weights, variables = zip[tuple[Any, ...]](*[(a.args[0], a.args[1]) if isinstance(a, cp.Operator) and a.name == "mul" and len(a.args) == 2 else (1, a) for a in variables_with_weights]) + for weight, variable in zip(weights, variables): + mps_obj.update_column(objective_name, variable.name, weight) + else: + raise ValueError(f"Invalid constraint type: {type(objective)}") # -------------------------------- Constraints ------------------------------- # - for i, cpm_con in enumerate(cpm_cons): + for i, cpm_con in enumerate(cpm_cons + additional_constraints): if isinstance(cpm_con, cp.expressions.core.Comparison): # Comparison type if cpm_con.name == "==": @@ -252,12 +289,16 @@ def from_cpmpy(cls, model: cp.Model) -> MPS: # LHS if cpm_con.args[0].name == "wsum": - weights, variables = cpm_con.args + weights, variables = cpm_con.args[0].args for weight, variable in zip(weights, variables): mps_obj.update_column(f'c{i}', variable.name, weight) elif cpm_con.args[0].name == "sum": - variables_with_weights = cpm_con.args - weights, variables = zip[tuple[Any, ...]](*[(a.args[0], a.args[1]) if isinstance(a, 
cp.Operator) and a.name == "mul" and len(a.args) == 2 else (1, a) for a in variables_with_weights]) + print(cpm_con) + variables_with_weights = cpm_con.args[0].args + weights, variables = tuple(zip(*[(a.args[0], a.args[1]) if isinstance(a, cp.expressions.core.Operator) and a.name == "mul" and len(a.args) == 2 else (1, a) for a in variables_with_weights])) + print("----") + print(weights, variables) + print(type(weights), type(variables)) for weight, variable in zip(weights, variables): mps_obj.update_column(f'c{i}', variable.name, weight) else: @@ -273,132 +314,127 @@ def from_cpmpy(cls, model: cp.Model) -> MPS: variables = get_variables(cpm_cons) for variable in variables: - lb, up = variable.get_bounds() - mps_obj.update_bounds(variable.name, "LI", lb) - mps_obj.update_bounds(variable.name, "UI", up) - - # --------------------------------- Objective -------------------------------- # + lb, ub = variable.get_bounds() + if lb == ub: + mps_obj.update_bounds(variable.name, "FX", lb) + else: + mps_obj.update_bounds(variable.name, "LI", lb) + mps_obj.update_bounds(variable.name, "UI", ub) - objective = cls._transform(model.objective, csemap=csemap) - objective_name = 'min' if model.minimize else 'max' + 'obj' - mps_obj.minimize = model.minimize - mps_obj.set_constraint_type(objective_name, ConstraintType.NON_CONSTRAINING) - if objective.name == "wsum": - weights, variables = objective.args - for weight, variable in zip(weights, variables): - mps_obj.update_column(objective_name, variable.name, weight) - elif objective.name == "sum": - variables_with_weights = objective.args - weights, variables = zip[tuple[Any, ...]](*[(a.args[0], a.args[1]) if isinstance(a, cp.Operator) and a.name == "mul" and len(a.args) == 2 else (1, a) for a in variables_with_weights]) - for weight, variable in zip(weights, variables): - mps_obj.update_column(objective_name, variable.name, weight) - else: - raise ValueError(f"Invalid constraint type: {type(objective)}") + # 
------------------------------------- - ------------------------------------ # return mps_obj - @classmethod - def _format_space(cls, string:str, space:Optional[int]=None, leading:int=0) -> str: + @staticmethod + def _format_space(string:str, space:Optional[int]=None, leading:int=0) -> str: + if isinstance(string, int): + string = str(string) if space is None: space=len(string) - if len(string) < space: + if len(string) <= space: return f"{'':<{leading}}{string:<{space}}" else: raise ValueError(f"String {string} is longer than {space} characters") - @classmethod - def _format_line(cls, strings, spaces, format:str, leading:int=0) -> str: + @staticmethod + def _format_line(strings, spaces, format:str, leading:int=0) -> str: if format == "fixed": - line = cls._format_string(strings[0], spaces[0], leading=leading) + line = MPS._format_space(strings[0], spaces[0], leading=leading) if len(strings) > 1: - line +=''.join([cls._format_string(string,space) for (string,space) in zip(strings[1:], spaces[1:])]) + line +=''.join([MPS._format_space(string,space) for (string,space) in zip(strings[1:], spaces[1:])]) return line elif format == "free": - return cls._format_space('', leading) + ' '.join(val for pair in zip(strings, spaces) for val in pair) + return MPS._format_space('', leading) + ' '.join(val for pair in zip(strings, spaces) for val in pair) else: raise ValueError(f"Invalid format: {format}") - @classmethod - def _write_name(cls, name, format:str) -> str: - return cls._format_line(('Name',), (14,), format=format) - - @classmethod - def _write_objective(cls, minimize:bool, format:str) -> str: - return cls._format_line(('N', f"{'min' if minimize else 'max'}obj"), (4, None), leading=1) - - @classmethod - def _write_row(cls, row_name:str, constraint_type: ConstraintType, format:str) -> str: - return cls._format_line((constraint_type.value, row_name), (4, None), leading=1, format=format) + @staticmethod + def _write_name(name, format:str) -> str: + print(name) + return 
MPS._format_line(('NAME', name), (14,None), format=format) + + @staticmethod + def _write_objective(minimize:bool, format:str) -> str: + return MPS._format_line(('N', f"{'min' if minimize else 'max'}obj"), (4, None), leading=1, format=format) + + @staticmethod + def _write_row(row_name:str, constraint_type: ConstraintType, format:str) -> str: + return MPS._format_line((constraint_type.value, row_name), (3, None), leading=1, format=format) - @classmethod - def _write_opening_marker(cls, format:str): - return cls._format_line(('MARK0000', 'MARKER', 'INTORG'), (10, 20, None), leading=4, format=format) + @staticmethod + def _write_opening_marker(format:str): + return MPS._format_line(('MARK0000', 'MARKER', 'INTORG'), (10, 21, None), leading=4, format=format) - @classmethod - def _write_closing_marker(cls, format): - return cls._format_line(('MARK0001', 'MARKER', 'INTEND'), (10, 20, None), leading=4, format=format) + @staticmethod + def _write_closing_marker(format): + return MPS._format_line(('MARK0001', 'MARKER', 'INTEND'), (10, 21, None), leading=4, format=format) - @classmethod - def _write_column(cls, column_name:str, variables_with_coefficients:List[Tuple[str, int]], format:str) -> str: + @staticmethod + def _write_column(column_name:str, variables_with_coefficients:List[Tuple[str, int]], format:str) -> str: for a,b in zip(variables_with_coefficients[::2],variables_with_coefficients[1::2]): - yield cls._format_line((column_name, a[0], a[1], b[0], b[1]), (10, 10, 5, 10, 5), leading=4, format=format) + yield MPS._format_line((column_name, a[0], a[1], b[0], b[1]), (10, 21, 4, 21, None), leading=4, format=format) if len(variables_with_coefficients) % 2 != 0: - yield cls._format_line((column_name, variables_with_coefficients[-1][0], variables_with_coefficients[-1][1]), (10, 10, 5), leading=4, format=format) + yield MPS._format_line((column_name, variables_with_coefficients[-1][0], variables_with_coefficients[-1][1]), (10, 21, None), leading=4, format=format) - 
@classmethod - def _write_rhs(cls, variables_with_coefficients:List[Tuple[str, int]], format:str) -> str: + @staticmethod + def _write_rhs(variables_with_coefficients:List[Tuple[str, int]], format:str) -> str: for a,b in zip(variables_with_coefficients[::2],variables_with_coefficients[1::2]): - yield cls._format_line(('rhs', a[0], a[1], b[0], b[1]), (10, 21, 5, 21, 5), leading=4, format=format) + yield MPS._format_line(('rhs', a[0], a[1], b[0], b[1]), (10, 21, 4, 21, None), leading=4, format=format) if len(variables_with_coefficients) % 2 != 0: - yield cls._format_line(('rhs', variables_with_coefficients[-1][0], variables_with_coefficients[-1][1]), (10, 21, 5), leading=4, format=format) + yield MPS._format_line(('rhs', variables_with_coefficients[-1][0], variables_with_coefficients[-1][1]), (10, 21, None), leading=4, format=format) - def write_mps(self, file_path: Optional[str] = None, format: str = "fixed"): + def write_mps(self, file_path: Optional[str] = None, name:Optional[str]=None, format: str = "fixed"): mps_string = [] if format == "fixed": # Name - mps_string.append(self._write_name(self._metadata['name'], format=format)) + print(name, type(name)) + name = name if name is not None else str(file_path).split(os.sep)[-1].split(".")[0].capitalize() if file_path is not None else "CPMPYMODEL" + mps_string.append(self._write_name(name, format=format)) # Rows mps_string.append("ROWS") - mps_string.append(self._write_objective(self.minimize, format=format)) + # mps_string.append(self._write_objective(self.minimize, format=format)) - for row_name, constraint_type in self._row_map.keys(): - mps_string.append(self._write_row(row_name, constraint_type)) + for row_name, constraint_type in self._row_map.items(): + mps_string.append(self._write_row(row_name, constraint_type, format=format)) # Columns mps_string.append("COLUMNS") - mps_string.append(self._write_opening_marker()) + mps_string.append(self._write_opening_marker(format=format)) for column_name, column_rows in 
self._A_matrix.items(): - for line in self._write_column(column_name, zip(column_rows.keys(), column_rows.values()), format=format): + for line in self._write_column(column_name, list(zip(column_rows.keys(), column_rows.values())), format=format): mps_string.append(line) - mps_string.append(self._write_closing_marker()) + mps_string.append(self._write_closing_marker(format=format)) # RHS mps_string.append("RHS") - for line in self._write_rhs(zip(self._rhs_map.keys(), self._rhs_map.values()), format=format): + for line in self._write_rhs(list(zip(self._rhs_map.keys(), self._rhs_map.values())), format=format): mps_string.append(line) # Bounds mps_string.append("BOUNDS") - for row_name in self._row_map.keys(): - variable_type = self._type_map[row_name] + print(self._A_matrix) + for column_name in self._A_matrix.keys(): + if column_name == 'minobj' or column_name == 'maxobj': + continue + variable_type = self._type_map[column_name] if variable_type == VariableType.FLOATING_POINT: - if row_name in self._lb_map: - mps_string.append(self._format_line(('LO', 'bnd', row_name, self._lb_map.get(row_name, 0)), (3, 10, 21, None), leading=1, format=format)) - if row_name in self._ub_map: - mps_string.append(self._format_line(('UP', 'bnd', row_name, self._ub_map[row_name]), (3, 10, 21, None), leading=1, format=format)) + if column_name in self._lb_map: + mps_string.append(self._format_line(('LO', 'bnd', column_name, self._lb_map.get(column_name, 0)), (3, 10, 21, None), leading=1, format=format)) + if column_name in self._ub_map: + mps_string.append(self._format_line(('UP', 'bnd', column_name, self._ub_map[column_name]), (3, 10, 21, None), leading=1, format=format)) elif variable_type == VariableType.INTEGER: - if row_name in self._lb_map: - mps_string.append(self._format_line(('LI', 'bnd', row_name, self._lb_map.get(row_name, 0)), (3, 10, 21, None), leading=1, format=format)) - if row_name in self._ub_map: - mps_string.append(self._format_line(('UI', 'bnd', row_name, 
self._ub_map[row_name]), (3, 10, 21, None), leading=1, format=format)) + if column_name in self._lb_map: + mps_string.append(self._format_line(('LI', 'bnd', column_name, self._lb_map.get(column_name, 0)), (3, 10, 21, None), leading=1, format=format)) + if column_name in self._ub_map: + mps_string.append(self._format_line(('UI', 'bnd', column_name, self._ub_map[column_name]), (3, 10, 21, None), leading=1, format=format)) elif variable_type == VariableType.BINARY: - mps_string.append(self._format_line(('BV', 'bnd', row_name, self._lb_map.get(row_name, 0)), (3, 10, 21, None), leading=1, format=format)) + mps_string.append(self._format_line(('BV', 'bnd', column_name, self._lb_map.get(column_name, 0)), (3, 10, 21, None), leading=1, format=format)) elif variable_type == VariableType.CONSTANT: - mps_string.append(self._format_line(('FX', 'bnd', row_name, self._lb_map[row_name]), (3, 10, 21, None), leading=1, format=format)) + mps_string.append(self._format_line(('FX', 'bnd', column_name, self._lb_map[column_name]), (3, 10, 21, None), leading=1, format=format)) else: - raise ValueError(f"Invalid variable type: {variable_type} for variable: {row_name}") + raise ValueError(f"Invalid variable type: {variable_type} for variable: {column_name}") # End mps_string.append("ENDATA") @@ -446,8 +482,8 @@ def update_column(self, column_name: str, row_name: str, row_coeff: str): Updates the A matrix. Arguments: - column_name (str): The name of the column. - row_name (str): The name of the row. + column_name (str): The name of the column. constraint name + row_name (str): The name of the row. variable name row_coeff (str): The coefficient of the row. 
""" if self._intorg: @@ -457,7 +493,7 @@ def update_column(self, column_name: str, row_name: str, row_coeff: str): row_coeff = int(row_coeff) else: raise ValueError(f"Floating point variables are not supported: {row_coeff}") - self._A_matrix[column_name] = self._A_matrix.get(column_name, {}) | {row_name: row_coeff} + self._A_matrix[row_name] = self._A_matrix.get(row_name, {}) | {column_name: row_coeff} def update_rhs(self, row_name: str, row_coeff: str): """ @@ -548,8 +584,8 @@ def invert_A_matrix(self): inverted_A_matrix[row_name] = inverted_A_matrix.get(row_name, {}) | {column_name: row_coeff} return inverted_A_matrix - @classmethod - def _read_line(cls, line, starts:List[int], format:str, required:Optional[List[bool]]=None) -> List: + @staticmethod + def _read_line(line, starts:List[int], format:str, required:Optional[List[bool]]=None) -> List: if required is not None: for i, (s, r) in enumerate(zip(starts, required)): if s >= len(line): @@ -591,7 +627,7 @@ def _parse_mps(f: TextIO, format: str = "fixed", **kwargs) -> MPS: print(line) if line.startswith("NAME"): - mps_obj._metadata["name"] = mps_obj._read_line(line, (15,), format=format) + mps_obj._metadata["name"] = mps_obj._read_line(line, (14,), format=format)[0] i += 1 line = lines[i] elif line.startswith("OBJSENSE"): # optional, not part of core specification @@ -621,14 +657,14 @@ def _parse_mps(f: TextIO, format: str = "fixed", **kwargs) -> MPS: elif line.startswith("COLUMNS"): i += 1 line = lines[i] - while i < len(lines) and (line[0] == " " or line[0] == "*"): - if len(line) >= 32 and line[14:22] == "'MARKER'": + while i < len(lines) and (line[0] == " " or line[0] == "*"): + if len(line) >= 32 and line[14:20] == "MARKER": mps_obj.set_marker(line[24:34]) else: column_name, row_name, row_coeff, row2_name, row2_coeff = mps_obj._read_line(line, (4, 14, 35, 39, 60), required=(True, True, True, False, False), format=format) - mps_obj.update_column(column_name, row_name, row_coeff) + 
mps_obj.update_column(row_name, column_name, row_coeff) if row2_name is not None: - mps_obj.update_column(column_name, row2_name, row2_coeff) + mps_obj.update_column(row2_name, column_name, row2_coeff) i += 1 line = lines[i] elif line.startswith("RHS"): @@ -715,6 +751,8 @@ def write_mps(model: cp.Model, file_path: Optional[str] = None, format: str = "f str: The MPS string. """ mps_obj = MPS.from_cpmpy(model) + print("======") + print(mps_obj._A_matrix) mps_string = mps_obj.write_mps(file_path, format=format) return mps_string From 73ea3dea619e7c39b3d08456fcf4d9f0760925ca Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 22 Jan 2026 14:27:56 +0100 Subject: [PATCH 065/152] Add basic tests --- tests/test_tool_mps.py | 90 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tests/test_tool_mps.py diff --git a/tests/test_tool_mps.py b/tests/test_tool_mps.py new file mode 100644 index 000000000..d2b6dc97e --- /dev/null +++ b/tests/test_tool_mps.py @@ -0,0 +1,90 @@ + +import pytest +import unittest +import tempfile +import os +import cpmpy as cp +from cpmpy.tools.mps import read_mps, write_mps +from cpmpy.transformations.get_variables import get_variables +from cpmpy.tools.mps.parser import MPS + +class MPSTool(unittest.TestCase): + + mps = """\ +NAME CPMPYMODEL +ROWS + N minobj + L c0 + G c1 + E c2 +COLUMNS + MARK0000 MARKER INTORG + XONE minobj 1 c0 1 + XONE c1 1 + YTWO minobj 4 c0 1 + YTWO c2 -1 + ZTHREE minobj 9 c1 1 + ZTHREE c2 1 + MARK0001 MARKER INTEND +RHS + rhs c0 5 c1 10 + rhs c2 7 +BOUNDS + LI bnd XONE 0 + UI bnd XONE 4 + LI bnd YTWO -1 + UI bnd YTWO 1 + FX bnd ZTHREE 3 +ENDATA\ +""" + def setUp(self) -> None: + self.tmpfile = tempfile.NamedTemporaryFile(mode='w', delete=False) + + def tearDown(self) -> None: + self.tmpfile.close() + os.remove(self.tmpfile.name) + + def test_read_mps(self): + + model = read_mps(self.mps, model_constants=True, filter_zeros=False) + + # 1) test variables + variables = 
get_variables(model.constraints) + for variable in variables: + if variable.name == "XONE": + self.assertEqual(variable.lb, 0) + self.assertEqual(variable.ub, 4) + elif variable.name == "YTWO": + self.assertEqual(variable.lb, -1) + self.assertEqual(variable.ub, 1) + elif variable.name == "ZTHREE": + self.assertEqual(variable.lb, 3) + self.assertEqual(variable.ub, 3) + else: + self.fail(f"Unexpected variable: {variable.name}") + + # 2) test objective + assert str(model.objective_) == str(cp.sum(cp.cpm_array([1, 4, 9])*cp.cpm_array([cp.intvar(0, 4, name="XONE"), cp.intvar(-1, 1, name="YTWO"), cp.intvar(3, 3, name="ZTHREE")]))) + + # 3) test constraints + assert str(model.constraints[0]) == str(cp.intvar(0, 4, name="XONE") + cp.intvar(-1, 1, name="YTWO") <= 5) + assert str(model.constraints[1]) == str(cp.intvar(0, 4, name="XONE") + cp.intvar(3, 3, name="ZTHREE") >= 10) + assert str(model.constraints[2]) == str(cp.sum(cp.cpm_array([-1, 1])*cp.cpm_array([cp.intvar(-1, 1, name="YTWO"), cp.intvar(3, 3, name="ZTHREE")])) == 7) + + + def test_write_mps(self): + + + + + model = read_mps(self.mps, model_constants=True, filter_zeros=False) + print(model) + + # mps_obj = MPS().from_cpmpy(model) + # print(mps_obj) + + + + mps = write_mps(model) + assert mps == self.mps + From 5b32da1e0a5ea908df735e94358c6a2adac4bed6 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 10:40:47 +0100 Subject: [PATCH 066/152] Add SCIP as reader/writer tool --- cpmpy/tools/mps/parser.py | 750 +------------------------------------ cpmpy/tools/scip/parser.py | 565 ++++++++++++++++++++++++++++ 2 files changed, 575 insertions(+), 740 deletions(-) create mode 100644 cpmpy/tools/scip/parser.py diff --git a/cpmpy/tools/mps/parser.py b/cpmpy/tools/mps/parser.py index 0983ee139..fef0e87dd 100644 --- a/cpmpy/tools/mps/parser.py +++ b/cpmpy/tools/mps/parser.py @@ -1,6 +1,7 @@ """ -Parser for the MPS format. +MPS parser. 
+This file implements helper functions for reading and writing MPS-formatted LP/MIP models. ================= List of functions @@ -9,752 +10,21 @@ .. autosummary:: :nosignatures: - read_mps + read_mps write_mps - -======================== -List of helper functions -======================== - -.. autosummary:: - :nosignatures: - - _parse_mps - _load_mps """ -from __future__ import annotations +from typing import Optional, Union import os -import cpmpy as cp -import numpy as np -from io import StringIO -from typing import Any, List, Optional, TextIO, Tuple, Union -from enum import Enum - -from cpmpy.transformations.comparison import only_numexpr_equality -from cpmpy.transformations.decompose_global import decompose_in_tree, decompose_objective -from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective -from cpmpy.transformations.get_variables import get_variables -from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv, only_positive_bv_wsum -from cpmpy.transformations.normalize import toplevel_list -from cpmpy.transformations.reification import only_implies, reify_rewrite, only_bv_reifies -from cpmpy.transformations.safening import no_partial_functions, safen_objective - - -class ConstraintType(Enum): - EQUAL = "E" # == - GREATER_THAN = "G" # > - LESS_THAN = "L" # < - NON_CONSTRAINING = "N" # objective - -class VariableType(Enum): - INTEGER = "I" - CONTINUOUS = "C" # not supported - FLOATING_POINT = "F" # not supported - BINARY = "B" - FREE = "F" # not supported - CONSTANT = "X" # only integers (for now float constants not supported, even in objective function) - -def _get_constraint_type(constraint_type: str) -> ConstraintType: - """ - Gets the constraint type from a string. - - Arguments: - constraint_type (str): The constraint type string. - - Returns: - ConstraintType: The constraint type. 
- """ - if constraint_type == "E": - return ConstraintType.EQUAL - elif constraint_type == "G": - return ConstraintType.GREATER_THAN - elif constraint_type == "L": - return ConstraintType.LESS_THAN - elif constraint_type == "N": - return ConstraintType.NON_CONSTRAINING - else: - raise ValueError(f"Invalid constraint type: {constraint_type}") - -class MPS: - - def __init__(self, assume_interger_variables:bool=True): - """ - Initializes the MPS object. - - Arguments: - assume_interger_variables (bool): Whether to assume integer variables. Default is True. - If True, floating point variables will be converted to integer variables. - If False, floating point variables will be kept as floating point variables - and an exception will be raised (cpmpy does not support floating point decision variables) - """ - self.ASSUME_INTEGER_VARIABLES = assume_interger_variables - self._metadata = dict() # metadata on the MPS instance - self._row_map = dict() # maps constraint names to types of constraint (ConstraintType) - self.objective = None # name of the expression which represents the objective - self.minimize = True # direction of optimisation - self._A_matrix = {} # A matrix (variable x constraint) - self._rhs_map = dict() # right hand side of the expressions, maps expression name to its rhs - self._lb_map = dict() # lower bounds of the variables, maps variable name to its lb - self._ub_map = dict() # upper bounds of the variables, maps variable name to its ub - self._type_map = dict() # for each variable name, stores the type of variable it represents (VariableType) - self._intorg = False # state management for the INTORG marker (in COLUMNS section) - - @property - def metadata(self) -> dict: - """ - Returns the metadata of the MPS instance. - - Returns: - dict: The metadata of the MPS instance. 
- """ - return self._metadata - - def _get_bounds(self, variable_name:str) -> Tuple: - lb = self._lb_map.get(variable_name, 0) - if variable_name not in self._ub_map: - raise ValueError(f"Upper bound not found for variable: {variable_name}. CPMpy does not support unbounded variables.") - ub = self._ub_map[variable_name] - return lb, ub - - - def to_cpmpy(self, model_constants:bool=False, filter_zeros:bool=True) -> cp.Model: - """ - Converts the MPS instance to a CPMpy model. - - Returns: - cp.Model: The CPMpy model. - """ - - _var_map = dict() - - def _get_variable(variable_name: str): - if variable_name not in _var_map: - - type = self._type_map.get(variable_name, VariableType.FREE) - if type == VariableType.INTEGER: - _var_map[variable_name] = cp.intvar(name=variable_name, lb=self._get_bounds(variable_name)[0], ub=self._get_bounds(variable_name)[1]) - elif type == VariableType.FLOATING_POINT: - if self.ASSUME_INTEGER_VARIABLES: - _var_map[variable_name] = cp.intvar(name=variable_name, lb=int(self._get_bounds(variable_name)[0]), ub=int(self._get_bounds(variable_name)[1])) - else: - raise ValueError(f"Floating point variables are not supported: {variable_name}") - elif type == VariableType.BINARY: - _var_map[variable_name] = cp.boolvar(name=variable_name) - elif type == VariableType.CONSTANT: - if model_constants: - _var_map[variable_name] = cp.intvar(name=variable_name, lb=self._get_bounds(variable_name)[0], ub=self._get_bounds(variable_name)[0]) - else: - _var_map[variable_name] = self._get_bounds(variable_name)[0] - else: - raise ValueError(f"Invalid variable type: {type} for variable: {variable_name}") - - return _var_map[variable_name] - - def _get_variables(variable_names: list[str]): - return np.array([_get_variable(variable_name) for variable_name in variable_names]) - - model = cp.Model() - - inverted_A_matrix = self.invert_A_matrix() - - for constraint_name, constraint_type in self._row_map.items(): - print(constraint_name, constraint_type) - if 
constraint_type == ConstraintType.NON_CONSTRAINING: - print(inverted_A_matrix) - obj_array = np.array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) - if filter_zeros: - obj_array = [o for o in obj_array if not (isinstance(o, (int, np.integer)) and o == 0)] - objective = cp.sum(obj_array) - if self.minimize: - model.minimize(objective) - else: - model.maximize(objective) - - else: - if constraint_type == ConstraintType.EQUAL: - lhs = np.array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) - if filter_zeros: - lhs = [l for l in lhs if not (isinstance(l, (int, np.integer)) and l == 0)] - model += cp.sum(lhs) == self._rhs_map[constraint_name] - elif constraint_type == ConstraintType.GREATER_THAN: - lhs = np.array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) - if filter_zeros: - lhs = [l for l in lhs if not (isinstance(l, (int, np.integer)) and l == 0)] - model += cp.sum(lhs) >= self._rhs_map[constraint_name] - elif constraint_type == ConstraintType.LESS_THAN: - lhs = cp.cpm_array(list(inverted_A_matrix[constraint_name].values())) * _get_variables(list(inverted_A_matrix[constraint_name].keys())) - if filter_zeros: - lhs = [l for l in lhs if not (isinstance(l, int) and l.value != 0)] - model += cp.sum(list(lhs)) <= self._rhs_map[constraint_name] - else: - raise ValueError(f"Invalid constraint type: {constraint_type} for constraint: {constraint_name}") - - return model - - @staticmethod - def _transform(cpm_cons: list[cp.Expression], csemap: dict) -> list[cp.Expression]: - """ - Transforms a list of CPMpy expressions to a list of linearised expressions, compatible with the MPS format. - - Arguments: - cpm_cons (list[cp.Expression]): The list of CPMpy expressions to transform. - csemap (dict): The context-sensitive evaluation map. 
- """ - # TODO: for now just straight copy from CPM_gurobi - cpm_cons = toplevel_list(cpm_cons) - cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"mod", "div"}) # linearize expects safe exprs - supported = {"min", "max", "abs", "alldifferent"} # alldiff has a specialized MIP decomp in linearize - cpm_cons = decompose_in_tree(cpm_cons, supported, csemap=csemap) - cpm_cons = flatten_constraint(cpm_cons, csemap=csemap) # flat normal form - cpm_cons = reify_rewrite(cpm_cons, supported=frozenset(['sum', 'wsum']), csemap=csemap) # constraints that support reification - cpm_cons = only_numexpr_equality(cpm_cons, supported=frozenset(["sum", "wsum", "sub"]), csemap=csemap) # supports >, <, != - cpm_cons = only_bv_reifies(cpm_cons, csemap=csemap) - cpm_cons = only_implies(cpm_cons, csemap=csemap) # anything that can create full reif should go above... - print(cpm_cons) - # gurobi does not round towards zero, so no 'div' in supported set: https://github.com/CPMpy/cpmpy/pull/593#issuecomment-2786707188 - cpm_cons = linearize_constraint(cpm_cons, supported=frozenset({"sum", "wsum", "sub"}), csemap=csemap, prune_trivial=False) # the core of the MIP-linearization - print(cpm_cons) - cpm_cons = only_positive_bv(cpm_cons, csemap=csemap) # after linearization, rewrite ~bv into 1-bv - return cpm_cons - - @staticmethod - def _transform_objective(cpm_obj: cp.Expression, csemap: dict) -> cp.Expression: - """ - Transforms a CPMpy expression to a linearised expression, compatible with the MPS format. - """ - - # transform objective - obj, safe_cons = safen_objective(cpm_obj) - obj, decomp_cons = decompose_objective(obj, csemap=csemap) - obj, flat_cons = flatten_objective(obj, csemap=csemap) - obj = only_positive_bv_wsum(obj) # remove negboolviews - - additional_constraints = (safe_cons + decomp_cons + flat_cons) - return obj, additional_constraints - - - @staticmethod - def from_cpmpy(model: cp.Model) -> MPS: - """· - Converts a CPMpy model to an MPS object. 
- - Arguments: - model (cp.Model): The CPMpy model to convert. - """ - cpm_expr = model.constraints - for c in cpm_expr: - print(c) - csemap = dict() - cpm_cons = MPS._transform(cpm_expr, csemap=csemap) - for c in cpm_cons: - print(c) - - mps_obj = MPS() - - - # --------------------------------- Objective -------------------------------- # - - objective, additional_constraints = MPS._transform_objective(model.objective_, csemap=csemap) - - objective_name = ('min' if model.minimize else 'max') + 'obj' - mps_obj.minimize = model.minimize - mps_obj.set_constraint_type(objective_name, ConstraintType.NON_CONSTRAINING) - if objective.name == "wsum": - weights, variables = objective.args - for weight, variable in zip(weights, variables): - mps_obj.update_column(objective_name, variable.name, weight) - elif objective.name == "sum": - variables_with_weights = objective.args - weights, variables = zip[tuple[Any, ...]](*[(a.args[0], a.args[1]) if isinstance(a, cp.Operator) and a.name == "mul" and len(a.args) == 2 else (1, a) for a in variables_with_weights]) - for weight, variable in zip(weights, variables): - mps_obj.update_column(objective_name, variable.name, weight) - else: - raise ValueError(f"Invalid constraint type: {type(objective)}") - - # -------------------------------- Constraints ------------------------------- # - - for i, cpm_con in enumerate(cpm_cons + additional_constraints): - if isinstance(cpm_con, cp.expressions.core.Comparison): - # Comparison type - if cpm_con.name == "==": - mps_obj.set_constraint_type(f'c{i}', ConstraintType.EQUAL) - elif cpm_con.name == ">=": - mps_obj.set_constraint_type(f'c{i}', ConstraintType.GREATER_THAN) - elif cpm_con.name == "<=": - mps_obj.set_constraint_type(f'c{i}', ConstraintType.LESS_THAN) - else: - raise ValueError(f"Invalid comparison operator: {cpm_con.name}") - - # LHS - if cpm_con.args[0].name == "wsum": - weights, variables = cpm_con.args[0].args - for weight, variable in zip(weights, variables): - 
mps_obj.update_column(f'c{i}', variable.name, weight) - elif cpm_con.args[0].name == "sum": - print(cpm_con) - variables_with_weights = cpm_con.args[0].args - weights, variables = tuple(zip(*[(a.args[0], a.args[1]) if isinstance(a, cp.expressions.core.Operator) and a.name == "mul" and len(a.args) == 2 else (1, a) for a in variables_with_weights])) - print("----") - print(weights, variables) - print(type(weights), type(variables)) - for weight, variable in zip(weights, variables): - mps_obj.update_column(f'c{i}', variable.name, weight) - else: - raise ValueError(f"Invalid constraint type: {type(cpm_con.args[0])}") - - # RHS - mps_obj.update_rhs(f'c{i}', cpm_con.args[1]) - - else: - raise ValueError(f"Invalid constraint type: {type(cpm_con)}") - - # --------------------------------- Variables -------------------------------- # - - variables = get_variables(cpm_cons) - for variable in variables: - lb, ub = variable.get_bounds() - if lb == ub: - mps_obj.update_bounds(variable.name, "FX", lb) - else: - mps_obj.update_bounds(variable.name, "LI", lb) - mps_obj.update_bounds(variable.name, "UI", ub) - - - # ------------------------------------- - ------------------------------------ # - - return mps_obj - - @staticmethod - def _format_space(string:str, space:Optional[int]=None, leading:int=0) -> str: - if isinstance(string, int): - string = str(string) - if space is None: - space=len(string) - if len(string) <= space: - return f"{'':<{leading}}{string:<{space}}" - else: - raise ValueError(f"String {string} is longer than {space} characters") - - @staticmethod - def _format_line(strings, spaces, format:str, leading:int=0) -> str: - if format == "fixed": - line = MPS._format_space(strings[0], spaces[0], leading=leading) - if len(strings) > 1: - line +=''.join([MPS._format_space(string,space) for (string,space) in zip(strings[1:], spaces[1:])]) - return line - elif format == "free": - return MPS._format_space('', leading) + ' '.join(val for pair in zip(strings, spaces) for 
val in pair) - else: - raise ValueError(f"Invalid format: {format}") - - @staticmethod - def _write_name(name, format:str) -> str: - print(name) - return MPS._format_line(('NAME', name), (14,None), format=format) - - @staticmethod - def _write_objective(minimize:bool, format:str) -> str: - return MPS._format_line(('N', f"{'min' if minimize else 'max'}obj"), (4, None), leading=1, format=format) - - @staticmethod - def _write_row(row_name:str, constraint_type: ConstraintType, format:str) -> str: - return MPS._format_line((constraint_type.value, row_name), (3, None), leading=1, format=format) - - @staticmethod - def _write_opening_marker(format:str): - return MPS._format_line(('MARK0000', 'MARKER', 'INTORG'), (10, 21, None), leading=4, format=format) - - @staticmethod - def _write_closing_marker(format): - return MPS._format_line(('MARK0001', 'MARKER', 'INTEND'), (10, 21, None), leading=4, format=format) - - @staticmethod - def _write_column(column_name:str, variables_with_coefficients:List[Tuple[str, int]], format:str) -> str: - for a,b in zip(variables_with_coefficients[::2],variables_with_coefficients[1::2]): - yield MPS._format_line((column_name, a[0], a[1], b[0], b[1]), (10, 21, 4, 21, None), leading=4, format=format) - if len(variables_with_coefficients) % 2 != 0: - yield MPS._format_line((column_name, variables_with_coefficients[-1][0], variables_with_coefficients[-1][1]), (10, 21, None), leading=4, format=format) - - @staticmethod - def _write_rhs(variables_with_coefficients:List[Tuple[str, int]], format:str) -> str: - for a,b in zip(variables_with_coefficients[::2],variables_with_coefficients[1::2]): - yield MPS._format_line(('rhs', a[0], a[1], b[0], b[1]), (10, 21, 4, 21, None), leading=4, format=format) - if len(variables_with_coefficients) % 2 != 0: - yield MPS._format_line(('rhs', variables_with_coefficients[-1][0], variables_with_coefficients[-1][1]), (10, 21, None), leading=4, format=format) - - - def write_mps(self, file_path: Optional[str] = None, 
name:Optional[str]=None, format: str = "fixed"): - mps_string = [] - - - if format == "fixed": - # Name - print(name, type(name)) - name = name if name is not None else str(file_path).split(os.sep)[-1].split(".")[0].capitalize() if file_path is not None else "CPMPYMODEL" - mps_string.append(self._write_name(name, format=format)) - # Rows - mps_string.append("ROWS") - # mps_string.append(self._write_objective(self.minimize, format=format)) - - for row_name, constraint_type in self._row_map.items(): - mps_string.append(self._write_row(row_name, constraint_type, format=format)) - # Columns - mps_string.append("COLUMNS") - mps_string.append(self._write_opening_marker(format=format)) - for column_name, column_rows in self._A_matrix.items(): - for line in self._write_column(column_name, list(zip(column_rows.keys(), column_rows.values())), format=format): - mps_string.append(line) - mps_string.append(self._write_closing_marker(format=format)) - # RHS - mps_string.append("RHS") - for line in self._write_rhs(list(zip(self._rhs_map.keys(), self._rhs_map.values())), format=format): - mps_string.append(line) - # Bounds - mps_string.append("BOUNDS") - print(self._A_matrix) - for column_name in self._A_matrix.keys(): - if column_name == 'minobj' or column_name == 'maxobj': - continue - variable_type = self._type_map[column_name] - if variable_type == VariableType.FLOATING_POINT: - if column_name in self._lb_map: - mps_string.append(self._format_line(('LO', 'bnd', column_name, self._lb_map.get(column_name, 0)), (3, 10, 21, None), leading=1, format=format)) - if column_name in self._ub_map: - mps_string.append(self._format_line(('UP', 'bnd', column_name, self._ub_map[column_name]), (3, 10, 21, None), leading=1, format=format)) - elif variable_type == VariableType.INTEGER: - if column_name in self._lb_map: - mps_string.append(self._format_line(('LI', 'bnd', column_name, self._lb_map.get(column_name, 0)), (3, 10, 21, None), leading=1, format=format)) - if column_name in 
self._ub_map: - mps_string.append(self._format_line(('UI', 'bnd', column_name, self._ub_map[column_name]), (3, 10, 21, None), leading=1, format=format)) - elif variable_type == VariableType.BINARY: - mps_string.append(self._format_line(('BV', 'bnd', column_name, self._lb_map.get(column_name, 0)), (3, 10, 21, None), leading=1, format=format)) - elif variable_type == VariableType.CONSTANT: - mps_string.append(self._format_line(('FX', 'bnd', column_name, self._lb_map[column_name]), (3, 10, 21, None), leading=1, format=format)) - else: - raise ValueError(f"Invalid variable type: {variable_type} for variable: {column_name}") - # End - mps_string.append("ENDATA") - - mps_string = "\n".join(mps_string) - - if file_path is not None: - with open(file_path, "w") as f: - f.write(mps_string) - - return mps_string - - def set_objective(self, expression_name: str): - """ - Sets the name of the expression that represents the objective. - - Arguments: - expression_name (str): The name of the expression that represents the objective. - """ - self.objective = expression_name - - def set_constraint_type(self, constraint_name: str, constraint_type: ConstraintType): - """ - Sets the type of a constraint. - - Arguments: - constraint_name (str): The name of the constraint. - constraint_type (ConstraintType): The type of the constraint. - """ - self._row_map[constraint_name] = constraint_type - - def set_marker(self, marker: str): - """ - Sets the marker for the INTORG/INTEND section. - - Arguments: - marker (str): The marker to set. - """ - if "'INTORG'" == marker: - self._intorg = True - elif "'INTEND'" == marker: - self._intorg = False - - def update_column(self, column_name: str, row_name: str, row_coeff: str): - """ - Updates the A matrix. - - Arguments: - column_name (str): The name of the column. constraint name - row_name (str): The name of the row. variable name - row_coeff (str): The coefficient of the row. 
- """ - if self._intorg: - row_coeff = int(row_coeff) - else: - if self.ASSUME_INTEGER_VARIABLES: - row_coeff = int(row_coeff) - else: - raise ValueError(f"Floating point variables are not supported: {row_coeff}") - self._A_matrix[row_name] = self._A_matrix.get(row_name, {}) | {column_name: row_coeff} - - def update_rhs(self, row_name: str, row_coeff: str): - """ - Updates the right hand side of a constraint. - - Arguments: - row_name (str): The name of the constraint. - row_coeff (str): The right hand side of the constraint. - """ - if self._intorg: - row_coeff = int(row_coeff) - else: - if self.ASSUME_INTEGER_VARIABLES: - row_coeff = int(row_coeff) - elif row_coeff != int(row_coeff): - raise ValueError(f"Floating point variables are not supported: {row_coeff}") - else: - row_coeff = int(row_coeff) - self._rhs_map[row_name] = row_coeff - - def update_bounds(self, row_name: str, type: str, bound_value: str): - """ - Updates the bounds of a variable. - - Arguments: - row_name (str): The name of the variable. - type (str): The type of the bound. - bound_value (str): The value of the bound. 
- """ - if type == "LO": - self._type_map[row_name] = VariableType.FLOATING_POINT - if self.ASSUME_INTEGER_VARIABLES: - self._lb_map[row_name] = int(bound_value) - else: - if bound_value != int(bound_value): - raise ValueError(f"Floating point bounds are not supported: {bound_value}") - self._lb_map[row_name] = int(bound_value) - elif type == "UP": - self._type_map[row_name] = VariableType.FLOATING_POINT - if self.ASSUME_INTEGER_VARIABLES: - self._ub_map[row_name] = int(bound_value) - else: - if bound_value != int(bound_value): - raise ValueError(f"Floating point bounds are not supported: {bound_value}") - self._ub_map[row_name] = int(bound_value) - elif type == "FX": - self._type_map[row_name] = VariableType.CONSTANT - if bound_value != int(bound_value): - if self.ASSUME_INTEGER_VARIABLES: - bound_value = int(bound_value) - else: - raise ValueError(f"Floating point bounds are not supported: {bound_value}") - self._lb_map[row_name] = int(bound_value) - self._ub_map[row_name] = int(bound_value) - elif type == "BV": - self._type_map[row_name] = VariableType.BINARY - self._lb_map[row_name] = 0 - self._ub_map[row_name] = 1 - elif type == "LI": - self._type_map[row_name] = VariableType.INTEGER - self._lb_map[row_name] = int(bound_value) - elif type == "UI": - self._type_map[row_name] = VariableType.INTEGER - self._ub_map[row_name] = int(bound_value) - elif type == "SC": - pass - elif type == "SI": - pass - elif type == "FR": - pass - elif type == "MI": - pass - elif type == "PL": - pass - else: - raise ValueError(f"Invalid bound type: {type}") - - def invert_A_matrix(self): - """ - Inverts the A matrix, becoming a (constraint x variable) matrix. - - Returns: - dict: The inverted A matrix. 
- """ - inverted_A_matrix = dict() - for column_name, column_rows in self._A_matrix.items(): - for row_name, row_coeff in column_rows.items(): - inverted_A_matrix[row_name] = inverted_A_matrix.get(row_name, {}) | {column_name: row_coeff} - return inverted_A_matrix - - @staticmethod - def _read_line(line, starts:List[int], format:str, required:Optional[List[bool]]=None) -> List: - if required is not None: - for i, (s, r) in enumerate(zip(starts, required)): - if s >= len(line): - if r: - raise ValueError(f"Required field {i} is missing") - else: - i -= 1 - break - starts = starts[:i+1] - if format == "fixed": - res = [] - for a,b in zip(starts[:], starts[1:]): - res.append(line[a:b].strip()) - res.append(line[starts[-1]:].strip()) - if required is not None: - res += [None]*(len(required)-len(starts)) - return res - elif format == "free": - return line.split() + [None]*(len(required)-len(starts)) if required is not None else line.split() - else: - raise ValueError(f"Invalid format: {format}") - - -def _parse_mps(f: TextIO, format: str = "fixed", **kwargs) -> MPS: - """ - Parses an MPS string and returns an MPS object. - - Arguments: - mps (str): The MPS string to parse. 
- """ - - mps_obj = MPS() - - lines = f.readlines() - i = 0 - - while i < len(lines): - line = lines[i] - print(line) - - if line.startswith("NAME"): - mps_obj._metadata["name"] = mps_obj._read_line(line, (14,), format=format)[0] - i += 1 - line = lines[i] - elif line.startswith("OBJSENSE"): # optional, not part of core specification - direction = mps_obj._read_line(line, (9,), format=format) - if direction == "MIN": - pass # default is minimize - elif direction == "MAX": - mps_obj.minimize = False - else: - raise ValueError(f"Invalid optimisation direction: {direction}") - i += 1 - elif line.startswith("*"): # comment line - i += 1 - elif line.startswith("ROWS"): # name of constraints - i += 1 - line = lines[i] - while i < len(lines) and (line[0] == " " or line[0] == "*"): - # create mapping of constraint name to constraint type - constraint_type, constraint_name = mps_obj._read_line(line, (1, 4), format=format) - constraint_type = _get_constraint_type(constraint_type.lstrip()) # operators can be in column 2 or 3 - print(constraint_name) - if constraint_type == ConstraintType.NON_CONSTRAINING: - mps_obj.set_objective(constraint_name) - mps_obj.set_constraint_type(constraint_name, constraint_type) - i += 1 - line = lines[i] - elif line.startswith("COLUMNS"): - i += 1 - line = lines[i] - while i < len(lines) and (line[0] == " " or line[0] == "*"): - if len(line) >= 32 and line[14:20] == "MARKER": - mps_obj.set_marker(line[24:34]) - else: - column_name, row_name, row_coeff, row2_name, row2_coeff = mps_obj._read_line(line, (4, 14, 35, 39, 60), required=(True, True, True, False, False), format=format) - mps_obj.update_column(row_name, column_name, row_coeff) - if row2_name is not None: - mps_obj.update_column(row2_name, column_name, row2_coeff) - i += 1 - line = lines[i] - elif line.startswith("RHS"): - i += 1 - line = lines[i] - while i < len(lines) and (line[0] == " " or line[0] == "*"): - - row_name, row_coeff, row2_name, row2_coeff = mps_obj._read_line(line, (14, 
24, 39, 49), required=(True, True, False, False), format=format) - mps_obj.update_rhs(row_name, row_coeff) - if row2_name is not None: - mps_obj.update_rhs(row2_name, row2_coeff) - i += 1 - line = lines[i] - elif line.startswith("BOUNDS"): - i += 1 - line = lines[i] - while i < len(lines) and (line[0] == " " or line[0] == "*"): - type, _, row_name, bound_value = mps_obj._read_line(line, (1, 3, 14, 35), required=(True, True, True, False), format=format) - if bound_value is None: - bound_value = 0 - print(line) - print(row_name, type, bound_value) - mps_obj.update_bounds(row_name, type, bound_value) - i += 1 - line = lines[i] - elif line.startswith("ENDATA"): - break - else: - raise ValueError(f"Invalid line: {line}") - i += 1 - - return mps_obj - -def _load_mps(mps_obj: MPS, **kwargs) -> cp.Model: - """ - Loads an MPS object into a CPMpy model. - - Arguments: - mps_obj (MPS): The MPS object to load. - """ - return mps_obj.to_cpmpy(**kwargs) - - -_std_open = open -def read_mps(mps: Union[str, os.PathLike], open=open, format:str="fixed", **kwargs) -> cp.Model: - """ - Parser for MPS format. Reads in an instance and returns its matching CPMpy model. - - Arguments: - mps (str or os.PathLike): - - A file path to a MPS file - - OR a string containing the MPS content directly - open: (callable): - If mps is the path to a file, a callable to "open" that file (default=python standard library's 'open'). - format: (str): - The format of the MPS file. Can be "fixed" or "free". Default is "fixed". 
- """ - - # If mps is a path to a file -> open file - if isinstance(mps, (str, os.PathLike)) and os.path.exists(mps): - if open is not None: - f = open(mps) - else: - f = _std_open(mps, "rt") - # If mps is a string containing a model -> create a memory-mapped file - else: - f = StringIO(mps) - - - mps_obj = _parse_mps(f, format=format, **kwargs) - model = _load_mps(mps_obj, **kwargs) - return model - - -def write_mps(model: cp.Model, file_path: Optional[str] = None, format: str = "fixed") -> str: - """ - Writes a CPMpy model to an MPS string / file. - - Arguments: - model (cp.Model): The CPMpy model to write. - file_path (Optional[str]): Optional path to the MPS file to write. +import cpmpy as cp +from cpmpy.tools.scip.parser import read_scip - Returns: - str: The MPS string. - """ - mps_obj = MPS.from_cpmpy(model) - print("======") - print(mps_obj._A_matrix) - mps_string = mps_obj.write_mps(file_path, format=format) - return mps_string +def read_mps(mps: Union[str, os.PathLike], open=open, assume_integer:bool=False) -> cp.Model: + return read_scip(mps, open, assume_integer) +def write_mps(model: cp.Model, file_path: Optional[str] = None) -> str: + pass diff --git a/cpmpy/tools/scip/parser.py b/cpmpy/tools/scip/parser.py new file mode 100644 index 000000000..52a5f881b --- /dev/null +++ b/cpmpy/tools/scip/parser.py @@ -0,0 +1,565 @@ +""" +This file implements helper functions for exporting CPMpy models from and to various data +formats supported by the SCIP optimization suite. + +============ +Installation +============ + +The 'pyscipopt' python package must be installed separately through `pip`: + +.. code-block:: console + + $ pip install cpmpy[io.scip] + +=============== +List of classes +=============== + +.. autosummary:: + :nosignatures: + + CPM_scip + +================= +List of functions +================= + +.. 
autosummary:: + :nosignatures: + + read_scip + write_scip + to_scip + +============== +Module details +============== +""" + + +import math +import os +import tempfile +import numpy as np +import cpmpy as cp +import warnings + +from typing import Union, Optional + +from cpmpy.expressions.core import BoolVal, Comparison, Operator +from cpmpy.expressions.variables import _NumVarImpl, _BoolVarImpl, NegBoolView, _IntVarImpl +from cpmpy.transformations.comparison import only_numexpr_equality +from cpmpy.transformations.decompose_global import decompose_in_tree +from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective +from cpmpy.transformations.get_variables import get_variables +from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv +from cpmpy.transformations.normalize import toplevel_list +from cpmpy.transformations.reification import only_implies, reify_rewrite +from cpmpy.expressions.utils import is_any_list, is_num +from cpmpy.expressions.globalconstraints import DirectConstraint +from cpmpy.expressions.variables import ignore_variable_name_check + + +_std_open = open +def read_scip(fname: Union[str, os.PathLike], open=open, assume_integer:bool=False) -> cp.Model: + """ + Read a SCIP-compatible model from a file and return a CPMpy model. + + Arguments: + fname: The path to the SCIP-compatible file to read. + open: The function to use to open the file. (SCIP does not require this argument, will be ignored) + assume_integer: Whether to assume that all variables are integer. + + Returns: + A CPMpy model. 
+ """ + if not _SCIPWriter.supported(): + raise Exception("SCIP: Install SCIP IO dependencies: cpmpy[io.scip]") + + with ignore_variable_name_check(): + + from pyscipopt import Model + + # Load file into pyscipopt model + scip = Model() + scip.hideOutput() + scip.readProblem(filename=fname) + scip.hideOutput(quiet=False) + + # 1) translate variables + scip_vars = scip.getVars() + var_map = {} + for var in scip_vars: + name = var.name # name of the variable + vtype = var.vtype() # type of the variable + if vtype == "BINARY": + var_map[name] = cp.boolvar(name=name) + elif vtype == "INTEGER": + lb = int(var.getLbOriginal()) + ub = int(var.getUbOriginal()) + var_map[name] = cp.intvar(lb, ub, name=name) + elif vtype == "CONTINUOUS": + if assume_integer: + lb = int(math.ceil(var.getLbOriginal())) + ub = int(math.floor(var.getUbOriginal())) + if lb != var.getLbOriginal() or ub != var.getUbOriginal(): + warnings.warn(f"Continuous variable {name} has non-integer bounds {var.getLbOriginal()} - {var.getUbOriginal()}. 
 CPMpy will assume it is integer.")
+                var_map[name] = cp.intvar(lb, ub, name=name)
+            else:
+                raise ValueError(f"CPMpy does not support continuous variables: {name}")
+        else:
+            raise ValueError(f"Unsupported variable type: {vtype}")
+
+
+    model = cp.Model()
+
+    # 2) translate constraints
+    scip_cons = scip.getConss()
+    for cons in scip_cons:
+        ctype = cons.getConshdlrName() # type of the constraint
+
+        if ctype == "linear":
+            cons_vars = scip.getConsVars(cons) # variables in the constraint (x)
+            cons_coeff = scip.getConsVals(cons) # coefficients of the variables (A)
+
+            cpm_vars = [var_map[v.name] for v in cons_vars] # convert to CPMpy variables
+            cpm_sum = cp.sum(var*coeff for (var,coeff) in zip(cpm_vars, cons_coeff)) # Ax
+
+            lhs = scip.getLhs(cons) # lhs of the constraint
+            rhs = scip.getRhs(cons) # rhs of the constraint
+
+            # convert to integer bounds
+            _lhs = int(math.ceil(lhs))
+            _rhs = int(math.floor(rhs))
+            if _lhs != int(lhs) or _rhs != int(rhs):
+                if assume_integer:
+                    warnings.warn(f"Constraint {cons.name} has non-integer bounds. CPMpy will assume it is integer.")
+                else:
+                    raise ValueError(f"Constraint {cons.name} has non-integer bounds. 
CPMpy does not support non-integer bounds.") + + # add the constraint to the model + model += _lhs <= cpm_sum + model += cpm_sum <= _rhs + + else: + raise ValueError(f"Unsupported constraint type: {ctype}") + + # 3) translate objective + scip_objective = scip.getObjective() + direction = scip.getObjectiveSense() + + n_terms = len(scip_objective.terms) + obj_vars = cp.cpm_array([None]*n_terms) + obj_coeffs = np.zeros(n_terms, dtype=int) + + for i, (term, coeff) in enumerate(scip_objective.terms.items()): # terms is a dictionary mapping terms to coefficients + if len(term.vartuple) > 1: + raise ValueError(f"Unsupported objective term: {term}") # TODO <- assumes linear, support higher-order terms + cpm_var = var_map[term.vartuple[0].name] # TODO <- assumes linear + obj_vars[i] = cpm_var + + _coeff = int(math.floor(coeff)) + if _coeff != int(coeff): + if assume_integer: + warnings.warn(f"Objective term {term} has non-integer coefficient. CPMpy will assume it is integer.") + else: + raise ValueError(f"Objective term {term} has non-integer coefficient. CPMpy does not support non-integer coefficients.") + obj_coeffs[i] = _coeff + + if direction == "minimize": + model.minimize(cp.sum(obj_vars * obj_coeffs)) + elif direction == "maximize": + model.maximize(cp.sum(obj_vars * obj_coeffs)) + else: + raise ValueError(f"Unsupported objective sense: {direction}") + + return model + + + +class _SCIPWriter: + """ + A helper class aiding in translating CPMpy models to SCIP models. + + Borrows a lot of its implementation from the prototype SCIP solver interface from git branch `scip2`. + + TODO: code should be reused once SCIP has been added as a solver backend. 
+ """ + + @staticmethod + def supported(): + # try to import the package + try: + import pyscipopt as scip + return True + except: + return False + + def __init__(self, problem_name: Optional[str] = None): + if not self.supported(): + raise Exception( + "SCIP: Install SCIP IO dependencies: cpmpy[io.scip]") + import pyscipopt as scip + + self.scip_model = scip.Model(problem_name) + + self.user_vars = set() + self._varmap = dict() # maps cpmpy variables to native solver variables + self._csemap = dict() # maps cpmpy expressions to solver expressions + + self._cons_counter = 0 + + def solver_var(self, cpm_var): + """ + Creates solver variable for cpmpy variable + or returns from cache if previously created + """ + if is_num(cpm_var): # shortcut, eases posting constraints + return cpm_var + + # special case, negative-bool-view + # work directly on var inside the view + if isinstance(cpm_var, NegBoolView): + raise Exception("Negative literals should not be part of any equation. See /transformations/linearize for more details") + + # create if it does not exit + if cpm_var not in self._varmap: + if isinstance(cpm_var, _BoolVarImpl): + revar = self.scip_model.addVar(vtype='B', name=cpm_var.name) + elif isinstance(cpm_var, _IntVarImpl): + revar = self.scip_model.addVar(lb=cpm_var.lb, ub=cpm_var.ub, vtype='I', name=cpm_var.name) + else: + raise NotImplementedError("Not a known var {}".format(cpm_var)) + self._varmap[cpm_var] = revar + + # return from cache + return self._varmap[cpm_var] + + + def solver_vars(self, cpm_vars): + """ + Like `solver_var()` but for arbitrary shaped lists/tensors + """ + if is_any_list(cpm_vars): + return [self.solver_vars(v) for v in cpm_vars] + return self.solver_var(cpm_vars) + + def objective(self, expr, minimize=True): + """ + Post the given expression to the solver as objective to minimize/maximize + + 'objective()' can be called multiple times, only the last one is stored + + (technical side note: any constraints created during conversion 
of the objective + are premanently posted to the solver) + """ + + # make objective function non-nested + (flat_obj, flat_cons) = (flatten_objective(expr)) + self += flat_cons + get_variables(flat_obj, collect=self.user_vars) # add potentially created constraints + + # make objective function or variable and post + obj = self._make_numexpr(flat_obj) + if minimize: + self.scip_model.setObjective(obj, sense='minimize') + else: + self.scip_model.setObjective(obj, sense='maximize') + + + def _make_numexpr(self, cpm_expr): + """ + Turns a numeric CPMpy 'flat' expression into a solver-specific + numeric expression + + Used especially to post an expression as objective function + """ + import pyscipopt as scip + + if is_num(cpm_expr): + return cpm_expr + + # decision variables, check in varmap + if isinstance(cpm_expr, _NumVarImpl): # cp.boolvar is subclass of _NumVarImpl + return self.solver_var(cpm_expr) + + # sum + if cpm_expr.name == "sum": + return scip.quicksum(self.solver_vars(cpm_expr.args)) + if cpm_expr.name == "sub": + a,b = self.solver_vars(cpm_expr.args) + return a - b + # wsum + if cpm_expr.name == "wsum": + return scip.quicksum(w * self.solver_var(var) for w, var in zip(*cpm_expr.args)) + + raise NotImplementedError("scip: Not a known supported numexpr {}".format(cpm_expr)) + + + def transform(self, cpm_expr): + """ + Transform arbitrary CPMpy expressions to constraints the solver supports + + Implemented through chaining multiple solver-independent **transformation functions** from + the `cpmpy/transformations/` directory. + + See the 'Adding a new solver' docs on readthedocs for more information. + + :param cpm_expr: CPMpy expression, or list thereof + :type cpm_expr: Expression or list of Expression + + :return: list of Expression + """ + # apply transformations, then post internally + # expressions have to be linearized to fit in MIP model. 
See /transformations/linearize + cpm_cons = toplevel_list(cpm_expr) + supported = {"alldifferent"} # alldiff has a specialized MIP decomp in linearize + cpm_cons = decompose_in_tree(cpm_cons, supported) + cpm_cons = flatten_constraint(cpm_cons) # flat normal form + cpm_cons = reify_rewrite(cpm_cons, supported=frozenset(['sum', 'wsum','sub'])) # constraints that support reification + cpm_cons = only_numexpr_equality(cpm_cons, supported=frozenset(["sum", "wsum", "sub"])) # supports >, <, != + cpm_cons = only_implies(cpm_cons) # anything that can create full reif should go above... + cpm_cons = linearize_constraint(cpm_cons, supported=frozenset({"sum", "wsum","sub", "mul", "div"})) # the core of the MIP-linearization + cpm_cons = only_positive_bv(cpm_cons) # after linearization, rewrite ~bv into 1-bv + return cpm_cons + + def _get_constraint_name(self): + name = f"cons_{self._cons_counter}" + self._cons_counter += 1 + return name + + + def add(self, cpm_expr_orig): + """ + Eagerly add a constraint to the underlying solver. + + Any CPMpy expression given is immediately transformed (through `transform()`) + and then posted to the solver in this function. + + This can raise 'NotImplementedError' for any constraint not supported after transformation + + The variables used in expressions given to add are stored as 'user variables'. Those are the only ones + the user knows and cares about (and will be populated with a value after solve). All other variables + are auxiliary variables created by transformations. 
+ + :param cpm_expr: CPMpy expression, or list thereof + :type cpm_expr: Expression or list of Expression + + :return: self + """ + + # add new user vars to the set + get_variables(cpm_expr_orig, collect=self.user_vars) + + # transform and post the constraints + for cpm_expr in self.transform(cpm_expr_orig): + + # Comparisons: only numeric ones as 'only_bv_implies()' has removed the '==' reification for Boolean expressions + # numexpr `comp` bvar|const + if isinstance(cpm_expr, Comparison): + lhs, rhs = cpm_expr.args + sciprhs = self.solver_var(rhs) + + # Thanks to `only_numexpr_equality()` only supported comparisons should remain + if cpm_expr.name == '<=': + if (isinstance(lhs, Operator) and lhs.name == "sum" and all(a.is_bool() and not isinstance(a, NegBoolView) for a in lhs.args)): + if rhs == 1: # special SOS1 constraint? + self.scip_model.addConsSOS1(self.solver_vars(lhs.args), name=self._get_constraint_name()) + else: # cardinality constraint + self.scip_model.addConsCardinality(self.solver_vars(lhs.args), rhs, name=self._get_constraint_name()) + else: + sciplhs = self._make_numexpr(lhs) + self.scip_model.addCons(sciplhs <= sciprhs, name=self._get_constraint_name()) + + elif cpm_expr.name == '>=': + sciplhs = self._make_numexpr(lhs) + self.scip_model.addCons(sciplhs >= sciprhs, name=self._get_constraint_name()) + elif cpm_expr.name == '==': + if isinstance(lhs, _NumVarImpl) \ + or (isinstance(lhs, Operator) and (lhs.name == 'sum' or lhs.name == 'wsum' or lhs.name == "sub")): + # a BoundedLinearExpression LHS, special case, like in objective + sciplhs = self._make_numexpr(lhs) + self.scip_model.addCons(sciplhs == sciprhs, name=self._get_constraint_name()) + + elif lhs.name == 'mul': + scp_vars = self.solver_vars(lhs.args) + scp_lhs = scp_vars[0] * scp_vars[1] + for v in scp_vars[2:]: + scp_lhs *= v + self.scip_model.addCons(scp_lhs == sciprhs, name=self._get_constraint_name()) + + elif lhs.name == 'div': + a, b = self.solver_vars(lhs.args) + 
self.scip_model.addCons(a / b == sciprhs, name=self._get_constraint_name()) + + else: + raise NotImplementedError( + "Not a known supported scip comparison '{}' {}".format(lhs.name, cpm_expr)) + + # SCIP does have 'addConsAnd', 'addConsOr', 'addConsXor', 'addConsSOS2' #TODO? + else: + raise NotImplementedError( + "Not a known supported scip comparison '{}' {}".format(lhs.name, cpm_expr)) + + elif isinstance(cpm_expr, Operator) and cpm_expr.name == "->": + # Indicator constraints + # Takes form bvar -> sum(x,y,z) >= rvar + cond, sub_expr = cpm_expr.args + assert isinstance(cond, cp.boolvar), f"Implication constraint {cpm_expr} must have BoolVar as lhs" + assert isinstance(sub_expr, Comparison), "Implication must have linear constraints on right hand side" + + lhs, rhs = sub_expr.args + assert isinstance(lhs, _NumVarImpl) or lhs.name == "sum" or lhs.name == "wsum", f"Unknown linear expression {lhs} on right side of indicator constraint: {cpm_expr}" + assert is_num(rhs), f"linearize should only leave constants on rhs of comparison but got {rhs}" + + if sub_expr.name == ">=": # change sign + if lhs.name == "sum": + lhs = Operator("wsum", [[-1] * len(lhs.args), lhs.args]) + elif lhs.name == "wsum": + lhs = Operator("wsum", [[-w for w in lhs.args[0]], lhs.args[1]]) + else: + lhs = Operator("wsum",[[-1], [lhs]]) + sub_expr = lhs <= -rhs + + if sub_expr.name == "<=": + lhs, rhs = sub_expr.args + lin_expr = self._make_numexpr(lhs) + if isinstance(cond, NegBoolView): + self.scip_model.addConsIndicator(lin_expr <= rhs, name=self._get_constraint_name(), + binvar=self.solver_var(cond._bv), activeone=False) + else: + self.scip_model.addConsIndicator(lin_expr <= rhs, name=self._get_constraint_name(), + binvar=self.solver_var(cond), activeone=True) + + elif sub_expr.name == "==": # split into <= and >= + # TODO: refactor to avoid re-transforming constraints? 
+ self += [cond.implies(lhs <= rhs), cond.implies(lhs >= rhs)] + else: + raise Exception(f"Unknown linear expression {sub_expr} name") + + # True or False + elif isinstance(cpm_expr, BoolVal): + # not sure how else to do it + if cpm_expr.args[0] is False: + bv = self.solver_var(cp.boolvar()) + self.scip_model.addCons(bv <= -1, name=self._get_constraint_name()) + + # a direct constraint, pass to solver + elif isinstance(cpm_expr, DirectConstraint): + cpm_expr.callSolver(self, self.scip_model) + + else: + raise NotImplementedError(cpm_expr) # if you reach this... please report on github + + return self + __add__ = add + + +def _to_writer(model: cp.Model, problem_name: Optional[str] = None) -> _SCIPWriter: + """ + Convert a CPMpy model to a SCIP writer + """ + writer = _SCIPWriter(problem_name=problem_name) + # 1) post constraints + for constraint in model.constraints: + writer += constraint + # 2) post objective + if not model.has_objective(): + raise ValueError("Model has no objective function") + writer.objective(model.objective_, model.objective_is_min) + return writer + + +def to_scip(model: cp.Model) -> "pyscipopt.Model": + """ + Convert a CPMpy model to a SCIP model + + Arguments: + model: CPMpy model + + Returns: + pyscipopt.Model: SCIP model + """ + writer = _to_writer(model) + return writer.scip_model + + +def _add_header(fname: os.PathLike, format: str, header: Optional[str] = None): + """ + Add a header to a file. + + Arguments: + fname: The path to the file to add the header to. + format: The format of the file. + header: The header to add. 
+ """ + + with open(fname, "r") as f: + lines = f.readlines() + + if format == "mps": + header = ["* " + line + "\n" for line in header.splitlines()] + lines = header + lines + + elif format == "lp": + header = ["\\ " + line + "\n" for line in header.splitlines()] + lines = header + lines + + elif format == "cip": + header = ["# " + line + "\n" for line in header.splitlines()] + lines = header + lines + + elif format == "fzn": + header = ["% " + line + "\n" for line in header.splitlines()] + lines = header + lines + + elif format == "gms": + header = ["* " + line + "\n" for line in header.splitlines()] + lines = [lines[0]] + header + lines[1:] # handle first line: $OFFLISTING + + elif format == "pip": + header = ["\\ " + line + "\n" for line in header.splitlines()] + lines = header + lines + + with open(fname, "w") as f: + f.writelines(lines) + + +def write_scip(model: cp.Model, fname: Optional[str] = None, format: str = "mps", header: Optional[str] = None, verbose: bool = False) -> str: + """ + Write a CPMpy model to file using a SCIP provided writer. + Supported formats include: + - "mps" + - "lp" + - "cip" + - "fzn" + - "gms" + - "pip" + + More formats can be supported upon the installation of additional dependencies (like SIMPL). 
+ For more information, see the SCIP documentation: https://pyscipopt.readthedocs.io/en/latest/tutorials/readwrite.html + """ + + writer = _to_writer(model, problem_name="CPMpy Model") + + # Decide where to write + if fname is None: + with tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False) as tmp: + fname = tmp.name + try: + writer.scip_model.writeProblem(fname) + _add_header(fname, format, header) + with open(fname, "r") as f: + return f.read() + finally: + os.remove(fname) + else: + if not verbose: writer.scip_model.hideOutput() + writer.scip_model.writeProblem(fname, verbose=verbose) + if not verbose: writer.scip_model.hideOutput(quiet=False) + _add_header(fname, format, header) + with open(fname, "r") as f: + return f.read() + + From 7138a20b451f958bbbfa9821be6f9469906049e1 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 10:42:42 +0100 Subject: [PATCH 067/152] Add dataset name as metadata --- cpmpy/tools/dataset/_base.py | 2 +- cpmpy/tools/dataset/model/miplib.py | 1 + cpmpy/tools/dataset/model/mse.py | 2 ++ cpmpy/tools/dataset/model/xcsp3.py | 2 ++ 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 496780b2d..ded32412a 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -71,6 +71,7 @@ def open(self, instance) -> callable: def metadata(self, file) -> dict: metadata = self.category() | { + 'dataset': self.name, 'name': pathlib.Path(file).stem.replace(self.extension, ''), 'path': file, } @@ -102,7 +103,6 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: return filename, metadata - diff --git a/cpmpy/tools/dataset/model/miplib.py b/cpmpy/tools/dataset/model/miplib.py index c5a169cf4..166596ae7 100644 --- a/cpmpy/tools/dataset/model/miplib.py +++ b/cpmpy/tools/dataset/model/miplib.py @@ -15,6 +15,7 @@ class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible + name = "miplib" def __init__( self, diff --git 
a/cpmpy/tools/dataset/model/mse.py b/cpmpy/tools/dataset/model/mse.py index 3ddfebf35..b498d09a9 100644 --- a/cpmpy/tools/dataset/model/mse.py +++ b/cpmpy/tools/dataset/model/mse.py @@ -26,6 +26,8 @@ class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible More information on the competition can be found here: https://maxsat-evaluations.github.io/ """ + + name = "mse" def __init__( self, diff --git a/cpmpy/tools/dataset/model/xcsp3.py b/cpmpy/tools/dataset/model/xcsp3.py index f17a4d193..042c8a9b2 100644 --- a/cpmpy/tools/dataset/model/xcsp3.py +++ b/cpmpy/tools/dataset/model/xcsp3.py @@ -28,6 +28,8 @@ class XCSP3Dataset(_Dataset): More information on the competition can be found here: https://xcsp.org/competitions/ """ + name = "xcsp3" + def __init__( self, root: str = ".", From f5d40c555d3346c3e701e72290676021f0ef0fe2 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 10:44:34 +0100 Subject: [PATCH 068/152] Setup add io deps --- setup.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/setup.py b/setup.py index a4f5fe313..bb1b96f37 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,12 @@ def get_version(rel_path): } solver_dependencies["all"] = list({pkg for group in solver_dependencies.values() for pkg in group}) +format_dependencies = { + "io.mps": ["pyscipopt"], + "io.scip": ["pyscipopt"], + "io.dimacs": ["pyscipopt"], +} + setup( name='cpmpy', version=get_version("cpmpy/__init__.py"), @@ -58,6 +64,7 @@ def get_version(rel_path): extras_require={ # Solvers **solver_dependencies, + **format_dependencies, # Tools "xcsp3": ["pycsp3", "requests", "tqdm", "matplotlib", "psutil", "filelock", "gnureadline; platform_system != 'Windows'", "pyreadline3; platform_system == 'Windows'"], # didn't add CLI-specific req since some are not cross-platform # Other From b3aaf704ad8349ecb2132c41e569c8c1e18f0f92 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 10:52:57 +0100 Subject: [PATCH 069/152] generic IO module --- 
cpmpy/tools/io/__init__.py | 3 + cpmpy/tools/io/reader.py | 120 ++++++++++++++++++++++++++++++++++++ cpmpy/tools/io/utils.py | 33 ++++++++++ cpmpy/tools/io/writer.py | 121 +++++++++++++++++++++++++++++++++++++ 4 files changed, 277 insertions(+) create mode 100644 cpmpy/tools/io/__init__.py create mode 100644 cpmpy/tools/io/reader.py create mode 100644 cpmpy/tools/io/utils.py create mode 100644 cpmpy/tools/io/writer.py diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py new file mode 100644 index 000000000..1c42d1861 --- /dev/null +++ b/cpmpy/tools/io/__init__.py @@ -0,0 +1,3 @@ +from .writer import write, write_formats +from .reader import read, read_formats +from .utils import get_extension, get_format diff --git a/cpmpy/tools/io/reader.py b/cpmpy/tools/io/reader.py new file mode 100644 index 000000000..52631a6e4 --- /dev/null +++ b/cpmpy/tools/io/reader.py @@ -0,0 +1,120 @@ +""" +CPMpy tools for reading models from files. + +================= +List of functions +================= + +.. autosummary:: + :nosignatures: + + read + read_formats + +============== +Module details +============== +""" + +from typing import Callable, List, Optional + +import cpmpy as cp +from cpmpy.tools.scip.parser import read_scip +from cpmpy.tools.dimacs import read_dimacs +from cpmpy.tools.io.utils import get_format + +# mapping format names to appropriate reader functions +_reader_map = { + "mps": read_scip, + "lp": read_scip, + "cip": read_scip, + "fzn": read_scip, + "gms": read_scip, + "pip": read_scip, + "dimacs": read_dimacs, +} + + +def _get_reader(format: str) -> Callable[[str], cp.Model]: + """ + Get the reader function for a given format. + + Arguments: + format (str): The name of the format to get a reader for. + + Raises: + ValueError: If the format is not supported. + + Returns: + A callable that reads a model from a file. 
+ """ + + if format not in _reader_map: + raise ValueError(f"Unsupported format: {format}") + + return _reader_map[format] + +def read_formats() -> List[str]: + """ + List of supported read formats. + + Each can be used as the `format` argument to the `read` function. + E.g.: + + .. code-block:: python + + from cpmpy.tools.io import read + model = read(file_path, format="mps") + model = read(file_path, format="lp") + """ + return list(_reader_map.keys()) + +def _derive_format(file_path: str) -> str: + """ + Derive the format of a file from its path. + + Arguments: + file_path (str): The path to the file to derive the format from. + + Raises: + ValueError: If the format could not be derived from the file path. + + Returns: + The name of the format. + + Example: + >>> _derive_format("instance.mps") + "mps" + >>> _derive_format("instance.lp.xz") + "lp" + """ + + # Iterate over the file path extensions in reverse order + for ext in file_path.split(".")[::-1]: + try: + return get_format(ext) + except ValueError: + continue + + raise ValueError(f"No file format provided and could not derive format from file path: {file_path}") + +def read(file_path: str, format: Optional[str] = None) -> cp.Model: + """ + Read a model from a file. + + Arguments: + file_path (str): The path to the file to read. + format (Optional[str]): The format of the file to read. If None, the format will be derived from the file path. + + Raises: + ValueError: If the format is not supported. + + Returns: + A CPMpy model. 
+ """ + + if format is None: + format = _derive_format(file_path) + + reader = _get_reader(format) + return reader(file_path) \ No newline at end of file diff --git a/cpmpy/tools/io/utils.py b/cpmpy/tools/io/utils.py new file mode 100644 index 000000000..a31ad8d98 --- /dev/null +++ b/cpmpy/tools/io/utils.py @@ -0,0 +1,33 @@ +import warnings + + +# mapping file extensions to appropriate format names +_format_map = { + "mps" : "mps", + "lp" : "lp", + "cip" : "cip", + "fzn" : "fzn", + "gms" : "gms", + "pip" : "pip", + "wcnf" : "dimacs", + "cnf" : "dimacs", +} + +_extension_map = {} +for extension, format in _format_map.items(): + _extension_map[format] = _extension_map.get(format, []) + [extension] + +def get_extension(format: str) -> str: + """ + Get the file extension for a given format. + """ + if len(_extension_map[format]) > 1: + warnings.warn(f"Multiple extensions found for format {format}: {_extension_map[format]}. Using the first one: {_extension_map[format][0]}") + + return _extension_map[format][0] + +def get_format(extension: str) -> str: + """ + Get the format for a given file extension. + """ + return _format_map[extension] \ No newline at end of file diff --git a/cpmpy/tools/io/writer.py b/cpmpy/tools/io/writer.py new file mode 100644 index 000000000..aebcb2d8f --- /dev/null +++ b/cpmpy/tools/io/writer.py @@ -0,0 +1,121 @@ +""" +CPMpy tools for writing models to files. + +================= +List of functions +================= + +.. 
autosummary:: + :nosignatures: + + write + write_formats + +============== +Module details +============== +""" + +import inspect +from typing import Callable, Optional, List +from functools import partial + +import cpmpy as cp +from cpmpy.tools.scip.parser import write_scip +from cpmpy.tools.dimacs import write_dimacs + +# mapping format names to appropriate writer functions +_writer_map = { + "mps": partial(write_scip, format="mps"), + "lp": partial(write_scip, format="lp"), + "cip": partial(write_scip, format="cip"), + # "cnf": partial(write_scip, format="cnf"), # requires SIMPL, not included in pip package + # "diff": partial(write_scip, format="diff"), # requires SIMPL, not included in pip package + "fzn": partial(write_scip, format="fzn"), + "gms": partial(write_scip, format="gms"), + # "opb": partial(write_scip, format="opb"), # requires SIMPL, not included in pip package + # "osil": partial(write_scip, format="osil"), + "pip": partial(write_scip, format="pip"), + # "sol": partial(write_scip, format="sol"), # requires SIMPL, not included in pip package + # "wbo": partial(write_scip, format="wbo"), # requires SIMPL, not included in pip package + # "zpl": partial(write_scip, format="zpl"), # requires SIMPL, not included in pip package + "dimacs": write_dimacs, +} + +def _get_writer(format: str) -> Callable: + """ + Get the writer function for a given format. + + Arguments: + format (str): The name of the format to get a writer for. + + Raises: + ValueError: If the format is not supported. + + Returns: + A callable that writes a model to a file. + """ + + if format not in _writer_map: + raise ValueError(f"Unsupported format: {format}") + + return _writer_map[format] + +def write_formats() -> List[str]: + """ + List of supported write formats. + + Each can be used as the `format` argument to the `write` function. + E.g.: + + .. 
code-block:: python + + from cpmpy.tools.io import write, write_formats, get_extension + write(model, format=write_formats()[0]) + write(model, format=write_formats()[1], file_path=f"model.{get_extension(write_formats()[1])}") + """ + return list(_writer_map.keys()) + +def _create_header(format: str) -> str: + """ + Default header for a file. + """ + header = "-"*100 + "\n" + header += "File written by CPMpy\n" + header += f" Format: '{format}'\n" + header += f" CPMpy Version: {cp.__version__}\n" + header += "-"*100 + "\n" + return header + +def write(model: cp.Model, format: str, file_path: Optional[str] = None, verbose: bool = False, header: Optional[str] = None, **kwargs) -> str: + """ + Write a model to a file. + + Arguments: + model (cp.Model): The model to write. + format (str): The format to write the model in. + file_path (Optional[str]): The path to the file to write the model to. If None, only a string containing the model will be returned. + verbose (bool): Whether to print verbose output. + header (Optional[str]): The header to put at the top of the file. If None, a default header will be created. Pass an empty string to skip adding a header. + **kwargs: Additional arguments to pass to the writer. 
+ """ + + writer = _get_writer(format) + + kwargs["verbose"] = verbose + + # keep only kwargs the writer accepts + sig = inspect.signature(writer) + allowed = sig.parameters + filtered_kwargs = { + k: v for k, v in kwargs.items() + if k in allowed + } + + # create header if not provided + if header is None: + header = _create_header(format) + if header == "": + header = None + + return writer(model, fname=file_path, header=header, **filtered_kwargs) \ No newline at end of file From cb10ab8cb16b485aee3d2eae6d32a2873e3adef4 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 10:54:41 +0100 Subject: [PATCH 070/152] Dimacs add header support --- cpmpy/tools/dimacs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpmpy/tools/dimacs.py b/cpmpy/tools/dimacs.py index 19ab8d444..93109a5b1 100644 --- a/cpmpy/tools/dimacs.py +++ b/cpmpy/tools/dimacs.py @@ -23,9 +23,9 @@ from cpmpy.transformations.get_variables import get_variables import re +from typing import Optional - -def write_dimacs(model, fname=None, encoding="auto"): +def write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMACS file written by CPMpy"): """ Writes CPMpy model to DIMACS format Uses the "to_cnf" transformation from CPMpy @@ -65,6 +65,10 @@ def write_dimacs(model, fname=None, encoding="auto"): out += " ".join(ints + ["0"]) + "\n" + if header is not None: + header_lines = ["c " + line for line in header.splitlines()] + out = "\n".join(header_lines) + "\n" + out + if fname is not None: with open(fname, "w") as f: f.write(out) From b4be3aadadb06da031ea1d820fb9361eda3fe85e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 10:54:59 +0100 Subject: [PATCH 071/152] Dimacs check for objective --- cpmpy/tools/dimacs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpmpy/tools/dimacs.py b/cpmpy/tools/dimacs.py index 93109a5b1..42197cc22 100644 --- a/cpmpy/tools/dimacs.py +++ b/cpmpy/tools/dimacs.py @@ -38,6 +38,9 @@ def 
write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMAC :param encoding: the encoding used for `int2bool`, choose from ("auto", "direct", "order", or "binary") """ + if model.has_objective(): + raise ValueError("DIMACS format does not support objective functions") + constraints = toplevel_list(model.constraints) constraints = to_cnf(constraints, encoding=encoding) From dc34e6eef1629b36e2e66840d61f643020094fb1 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 11:38:02 +0100 Subject: [PATCH 072/152] Move IO tools --- .../tools/{jsplib/parser.py => io/jsplib.py} | 0 cpmpy/tools/{mps/parser.py => io/mps.py} | 0 .../parser.py => io/nurserostering.py} | 0 cpmpy/tools/{opb/parser.py => io/opb.py} | 0 cpmpy/tools/{rcpsp/parser.py => io/rcpsp.py} | 0 cpmpy/tools/{scip/parser.py => io/scip.py} | 0 cpmpy/tools/{wcnf/parser.py => io/wcnf.py} | 0 cpmpy/tools/jsplib/__init__.py | 20 ------------------ cpmpy/tools/mps/__init__.py | 21 ------------------- cpmpy/tools/nurserostering/__init__.py | 21 ------------------- cpmpy/tools/opb/__init__.py | 21 ------------------- cpmpy/tools/rcpsp/__init__.py | 20 ------------------ cpmpy/tools/wcnf/__init__.py | 20 ------------------ 13 files changed, 123 deletions(-) rename cpmpy/tools/{jsplib/parser.py => io/jsplib.py} (100%) rename cpmpy/tools/{mps/parser.py => io/mps.py} (100%) rename cpmpy/tools/{nurserostering/parser.py => io/nurserostering.py} (100%) rename cpmpy/tools/{opb/parser.py => io/opb.py} (100%) rename cpmpy/tools/{rcpsp/parser.py => io/rcpsp.py} (100%) rename cpmpy/tools/{scip/parser.py => io/scip.py} (100%) rename cpmpy/tools/{wcnf/parser.py => io/wcnf.py} (100%) delete mode 100644 cpmpy/tools/jsplib/__init__.py delete mode 100644 cpmpy/tools/mps/__init__.py delete mode 100644 cpmpy/tools/nurserostering/__init__.py delete mode 100644 cpmpy/tools/opb/__init__.py delete mode 100644 cpmpy/tools/rcpsp/__init__.py delete mode 100644 cpmpy/tools/wcnf/__init__.py diff --git 
a/cpmpy/tools/jsplib/parser.py b/cpmpy/tools/io/jsplib.py similarity index 100% rename from cpmpy/tools/jsplib/parser.py rename to cpmpy/tools/io/jsplib.py diff --git a/cpmpy/tools/mps/parser.py b/cpmpy/tools/io/mps.py similarity index 100% rename from cpmpy/tools/mps/parser.py rename to cpmpy/tools/io/mps.py diff --git a/cpmpy/tools/nurserostering/parser.py b/cpmpy/tools/io/nurserostering.py similarity index 100% rename from cpmpy/tools/nurserostering/parser.py rename to cpmpy/tools/io/nurserostering.py diff --git a/cpmpy/tools/opb/parser.py b/cpmpy/tools/io/opb.py similarity index 100% rename from cpmpy/tools/opb/parser.py rename to cpmpy/tools/io/opb.py diff --git a/cpmpy/tools/rcpsp/parser.py b/cpmpy/tools/io/rcpsp.py similarity index 100% rename from cpmpy/tools/rcpsp/parser.py rename to cpmpy/tools/io/rcpsp.py diff --git a/cpmpy/tools/scip/parser.py b/cpmpy/tools/io/scip.py similarity index 100% rename from cpmpy/tools/scip/parser.py rename to cpmpy/tools/io/scip.py diff --git a/cpmpy/tools/wcnf/parser.py b/cpmpy/tools/io/wcnf.py similarity index 100% rename from cpmpy/tools/wcnf/parser.py rename to cpmpy/tools/io/wcnf.py diff --git a/cpmpy/tools/jsplib/__init__.py b/cpmpy/tools/jsplib/__init__.py deleted file mode 100644 index 6ebdec377..000000000 --- a/cpmpy/tools/jsplib/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## __init__.py -## -""" -Set of utilities for working with JSPLib-formatted CP models. - - -================== -List of submodules -================== - -.. autosummary:: - :nosignatures: - - parser -""" - -from .parser import read_jsplib diff --git a/cpmpy/tools/mps/__init__.py b/cpmpy/tools/mps/__init__.py deleted file mode 100644 index 540863b84..000000000 --- a/cpmpy/tools/mps/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## __init__.py -## -""" -Set of utilities for working with MPS-formatted LP/MIP models. 
- - -================== -List of submodules -================== - -.. autosummary:: - :nosignatures: - - parser -""" - -from .parser import read_mps -from .parser import write_mps \ No newline at end of file diff --git a/cpmpy/tools/nurserostering/__init__.py b/cpmpy/tools/nurserostering/__init__.py deleted file mode 100644 index b41d8f604..000000000 --- a/cpmpy/tools/nurserostering/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## __init__.py -## -""" -Set of utilities for working with Nurse Rostering-formatted CP models. - - -================== -List of submodules -================== - -.. autosummary:: - :nosignatures: - - parser -""" - -from .parser import read_nurserostering - diff --git a/cpmpy/tools/opb/__init__.py b/cpmpy/tools/opb/__init__.py deleted file mode 100644 index ae751c7e7..000000000 --- a/cpmpy/tools/opb/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## __init__.py -## -""" -Set of utilities for working with OPB-formatted CP models. - -Currently only the restricted OPB PB24 format is supported (without WBO). - -================== -List of submodules -================== - -.. autosummary:: - :nosignatures: - - parser -""" - -from .parser import read_opb diff --git a/cpmpy/tools/rcpsp/__init__.py b/cpmpy/tools/rcpsp/__init__.py deleted file mode 100644 index b24d99980..000000000 --- a/cpmpy/tools/rcpsp/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## __init__.py -## -""" -Set of utilities for working with psplib-formatted rcpsp CP models. - - -================== -List of submodules -================== - -.. 
autosummary:: - :nosignatures: - - parser -""" - -from .parser import read_rcpsp diff --git a/cpmpy/tools/wcnf/__init__.py b/cpmpy/tools/wcnf/__init__.py deleted file mode 100644 index e2db10412..000000000 --- a/cpmpy/tools/wcnf/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- -## -## __init__.py -## -""" -Set of utilities for working with WCNF-formatted CP models. - - -================== -List of submodules -================== - -.. autosummary:: - :nosignatures: - - parser -""" - -from .parser import read_wcnf From ae75200620384f1de2ddab679cb31002ee2966de Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 11:41:00 +0100 Subject: [PATCH 073/152] add tools to IO module --- cpmpy/tools/io/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index 1c42d1861..e1d1fbd42 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -1,3 +1,11 @@ from .writer import write, write_formats from .reader import read, read_formats from .utils import get_extension, get_format + +from .jsplib import read_jsplib +from .mps import read_mps, write_mps +from .nurserostering import read_nurserostering +from .opb import read_opb +from .rcpsp import read_rcpsp +from .scip import read_scip, write_scip +from .wcnf import read_wcnf \ No newline at end of file From df93d356f02fd1c806324c05f74441ea1e28f925 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 11:43:40 +0100 Subject: [PATCH 074/152] Add comment --- cpmpy/tools/io/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index e1d1fbd42..f4e9de391 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -3,7 +3,10 @@ from .utils import get_extension, get_format from .jsplib import read_jsplib -from .mps import read_mps, write_mps +# TODO: this tool is just a wrapper around read_scip 
and write_scip, +# do we want such a wrapper for each format scip provides? +# You can already use the generic `read()` and `write()` to read and write any format scip provides. +from .mps import read_mps, write_mps from .nurserostering import read_nurserostering from .opb import read_opb from .rcpsp import read_rcpsp From d6e3f9726874f43ddd36ad6c8c9f7194125a3c68 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 11:45:24 +0100 Subject: [PATCH 075/152] Add docstring --- cpmpy/tools/io/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index f4e9de391..a79a2d3ea 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -1,3 +1,11 @@ +""" +IO tools for CPMpy. + +This module provides tools to read and write models in various formats. +Use the generic `read(..., format="...")` and `write(..., format="...")` functions to read and write +models in one of the supported formats. +""" + from .writer import write, write_formats from .reader import read, read_formats from .utils import get_extension, get_format From 06cf73676d117abd2416771ee7ff9cc25a40408f Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 11:47:20 +0100 Subject: [PATCH 076/152] Move datasets to shared directory --- cpmpy/tools/dataset/{problem => }/jsplib.py | 0 cpmpy/tools/dataset/{model => }/miplib.py | 0 cpmpy/tools/dataset/{model => }/mse.py | 0 cpmpy/tools/dataset/{problem => }/nurserostering.py | 0 cpmpy/tools/dataset/{model => }/opb.py | 0 cpmpy/tools/dataset/{problem => }/psplib.py | 0 cpmpy/tools/dataset/{model => }/xcsp3.py | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename cpmpy/tools/dataset/{problem => }/jsplib.py (100%) rename cpmpy/tools/dataset/{model => }/miplib.py (100%) rename cpmpy/tools/dataset/{model => }/mse.py (100%) rename cpmpy/tools/dataset/{problem => }/nurserostering.py (100%) rename cpmpy/tools/dataset/{model => }/opb.py (100%) rename 
cpmpy/tools/dataset/{problem => }/psplib.py (100%) rename cpmpy/tools/dataset/{model => }/xcsp3.py (100%) diff --git a/cpmpy/tools/dataset/problem/jsplib.py b/cpmpy/tools/dataset/jsplib.py similarity index 100% rename from cpmpy/tools/dataset/problem/jsplib.py rename to cpmpy/tools/dataset/jsplib.py diff --git a/cpmpy/tools/dataset/model/miplib.py b/cpmpy/tools/dataset/miplib.py similarity index 100% rename from cpmpy/tools/dataset/model/miplib.py rename to cpmpy/tools/dataset/miplib.py diff --git a/cpmpy/tools/dataset/model/mse.py b/cpmpy/tools/dataset/mse.py similarity index 100% rename from cpmpy/tools/dataset/model/mse.py rename to cpmpy/tools/dataset/mse.py diff --git a/cpmpy/tools/dataset/problem/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py similarity index 100% rename from cpmpy/tools/dataset/problem/nurserostering.py rename to cpmpy/tools/dataset/nurserostering.py diff --git a/cpmpy/tools/dataset/model/opb.py b/cpmpy/tools/dataset/opb.py similarity index 100% rename from cpmpy/tools/dataset/model/opb.py rename to cpmpy/tools/dataset/opb.py diff --git a/cpmpy/tools/dataset/problem/psplib.py b/cpmpy/tools/dataset/psplib.py similarity index 100% rename from cpmpy/tools/dataset/problem/psplib.py rename to cpmpy/tools/dataset/psplib.py diff --git a/cpmpy/tools/dataset/model/xcsp3.py b/cpmpy/tools/dataset/xcsp3.py similarity index 100% rename from cpmpy/tools/dataset/model/xcsp3.py rename to cpmpy/tools/dataset/xcsp3.py From 4c54114d14836e34e31aabb58650dfe6cbd5dc7c Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 11:50:41 +0100 Subject: [PATCH 077/152] convert JSPLibDataset to _Dataset subclass --- cpmpy/tools/dataset/jsplib.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/dataset/jsplib.py b/cpmpy/tools/dataset/jsplib.py index ea3e88341..964b64d93 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/dataset/jsplib.py @@ -17,8 +17,11 @@ import numpy as np import cpmpy as cp +from 
cpmpy.tools.dataset._base import _Dataset -class JSPLibDataset(object): # torch.utils.data.Dataset compatible +class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible + + name = "jsplib" """ JSP Dataset in a PyTorch compatible format. From a6017114bb33fe2f0caeb1dfb011a5d7459d7300 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 12:47:38 +0100 Subject: [PATCH 078/152] support nested files --- cpmpy/tools/dataset/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index ded32412a..933373548 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -36,10 +36,10 @@ def __init__( raise ValueError(f"Dataset not found. Please set download=True to download the dataset.") else: self.download() - files = sorted(list(self.dataset_dir.glob(f"*{self.extension}"))) + files = sorted(list(self.dataset_dir.rglob(f"*{self.extension}"))) print(f"Finished downloading {len(files)} instances") - files = sorted(list(self.dataset_dir.glob(f"*{self.extension}"))) + files = sorted(list(self.dataset_dir.rglob(f"*{self.extension}"))) if len(files) == 0: raise ValueError("Cannot find any instances inside dataset. Is it a valid dataset? 
If so, please report on GitHub.") @@ -79,7 +79,7 @@ def metadata(self, file) -> dict: def __len__(self) -> int: """Return the total number of instances.""" - return len(list(self.dataset_dir.glob(f"*{self.extension}"))) + return len(list(self.dataset_dir.rglob(f"*{self.extension}"))) def __getitem__(self, index: int) -> Tuple[Any, Any]: @@ -88,7 +88,7 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: raise IndexError("Index out of range") # Get all compressed XML files and sort for deterministic behavior - files = sorted(list(self.dataset_dir.glob(f"*{self.extension}"))) + files = sorted(list(self.dataset_dir.rglob(f"*{self.extension}"))) file_path = files[index] filename = str(file_path) From 2240e0ca5311277fc7953ce10330ab1f303a076d Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 12:48:10 +0100 Subject: [PATCH 079/152] opb nested files and competition filter --- cpmpy/tools/dataset/opb.py | 51 +++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/cpmpy/tools/dataset/opb.py b/cpmpy/tools/dataset/opb.py index 201075749..43a8dcfa8 100644 --- a/cpmpy/tools/dataset/opb.py +++ b/cpmpy/tools/dataset/opb.py @@ -4,6 +4,7 @@ https://www.cril.univ-artois.fr/PB25/ """ +import fnmatch import lzma import os import pathlib @@ -11,7 +12,7 @@ from urllib.error import HTTPError, URLError import tarfile -from .._base import _Dataset +from cpmpy.tools.dataset._base import _Dataset class OPBDataset(_Dataset): @@ -27,10 +28,13 @@ class OPBDataset(_Dataset): More information on the competition can be found here: https://www.cril.univ-artois.fr/PB25/ """ + name = "opb" + def __init__( self, root: str = ".", year: int = 2024, track: str = "OPT-LIN", + competition: bool = False, transform=None, target_transform=None, download: bool = False ): @@ -41,6 +45,7 @@ def __init__( root (str): Root directory where datasets are stored or will be downloaded to (default="."). 
year (int): Competition year of the dataset to use (default=2024). track (str): Track name specifying which subset of the competition instances to load (default="OPT-LIN"). + competition (bool): If True, the dataset will filtered on competition-used instances. transform (callable, optional): Optional transform applied to the instance file path. target_transform (callable, optional): Optional transform applied to the metadata dictionary. download (bool): If True, downloads the dataset if it does not exist locally (default=False). @@ -54,6 +59,7 @@ def __init__( self.root = pathlib.Path(root) self.year = year self.track = track + self.competition = competition # Check requested dataset if not str(year).startswith('20'): @@ -61,7 +67,7 @@ def __init__( if not track: raise ValueError("Track must be specified, e.g. exact-weighted, exact-unweighted, ...") - dataset_dir = self.root / str(year) / track + dataset_dir = self.root / str(year) / track / ('selected' if self.competition else 'normalized') super().__init__( dataset_dir=dataset_dir, @@ -82,12 +88,12 @@ def metadata(self, file) -> dict: def download(self): # TODO: add option to filter on competition instances - print(f"Downloading OPB {self.year} {self.track} instances...") + print(f"Downloading OPB {self.year} {self.track} {'competition' if self.competition else 'non-competition'} instances...") url = f"https://www.cril.univ-artois.fr/PB24/benchs/" year_suffix = str(self.year)[2:] # Drop the starting '20' - url_path = url + f"normalized-PB{year_suffix}.tar" - tar_path = self.root / f"normalized-extraPB{year_suffix}.tar" + url_path = url + f"{'normalized' if not self.competition else 'selected'}-PB{year_suffix}.tar" + tar_path = self.root / f"{'normalized' if not self.competition else 'selected'}-PB{year_suffix}.tar" try: urlretrieve(url_path, str(tar_path)) @@ -108,11 +114,18 @@ def download(self): # Extract only files from the specified track # Get all unique track names from tar - tracks = set() - for member in 
tar_ref.getmembers(): - parts = member.name.split("/") - if len(parts) > 2 and parts[0] == main_folder: - tracks.add(parts[1]) + if not self.competition: + tracks = set() + for member in tar_ref.getmembers(): + parts = member.name.split("/") + if len(parts) > 2 and parts[0] == main_folder: + tracks.add(parts[1]) + else: + tracks = set() + for member in tar_ref.getmembers(): + parts = member.name.split("/") + if len(parts) > 2 and parts[0] == main_folder: + tracks.add(parts[2]) # Check if requested track exists if self.track not in tracks: @@ -122,16 +135,24 @@ def download(self): self.dataset_dir.mkdir(parents=True, exist_ok=True) # Extract files for the specified track - prefix = f"{main_folder}/{self.track}/" + if not self.competition: + prefix = f"{main_folder}/{self.track}/" + else: + prefix = f"{main_folder}/*/{self.track}/" for member in tar_ref.getmembers(): - if member.name.startswith(prefix) and member.isfile(): + if fnmatch.fnmatch(member.name, prefix + "*") and member.isfile(): # Path relative to main_folder/track - relative_path = member.name[len(prefix):] + # Find where the track folder ends and get everything after + track_marker = f"/{self.track}/" + marker_pos = member.name.find(track_marker) + relative_path = member.name[marker_pos + len(track_marker):] # Flatten: replace "/" with "_" to encode subfolders (some instances have clashing names) - flat_name = relative_path.replace("/", "_") + flat_name = relative_path#.replace("/", "_") target_path = self.dataset_dir / flat_name + os.makedirs(os.path.dirname(target_path), exist_ok=True) + with tar_ref.extractfile(member) as source, open(target_path, "wb") as target: target.write(source.read()) @@ -142,6 +163,6 @@ def open(self, instance: os.PathLike) -> callable: return lzma.open(instance, 'rt') if str(instance).endswith(".xz") else open(instance) if __name__ == "__main__": - dataset = OPBDataset(year=2024, track="DEC-LIN", download=True) + dataset = OPBDataset(year=2024, track="DEC-LIN", 
competition=True, download=True) print("Dataset size:", len(dataset)) print("Instance 0:", dataset[0]) From 00cd3c9ac4194e62f28a2fcbfef0a4179c4b1afd Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 29 Jan 2026 12:49:20 +0100 Subject: [PATCH 080/152] More _Dataset subclassing --- cpmpy/tools/dataset/jsplib.py | 4 ++-- cpmpy/tools/dataset/nurserostering.py | 8 +++++--- cpmpy/tools/dataset/psplib.py | 5 ++++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cpmpy/tools/dataset/jsplib.py b/cpmpy/tools/dataset/jsplib.py index 964b64d93..876e7fe32 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/dataset/jsplib.py @@ -21,13 +21,13 @@ class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible - name = "jsplib" - """ JSP Dataset in a PyTorch compatible format. More information on JSPLib can be found here: https://github.com/tamy0612/JSPLIB """ + + name = "jsplib" def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False): """ diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py index 589a373de..6f7932b9a 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -15,7 +15,7 @@ import re import cpmpy as cp - +from cpmpy.tools.dataset._base import _Dataset # Optional dependencies try: import pandas as pd @@ -30,14 +30,16 @@ _HAS_FAKER = False -class NurseRosteringDataset(object): # torch.utils.data.Dataset compatible - +class NurseRosteringDataset(_Dataset): # torch.utils.data.Dataset compatible + """ Nurserostering Dataset in a PyTorch compatible format. More information on nurserostering instances can be found here: https://schedulingbenchmarks.org/nrp/ """ + name = "nurserostering" + def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False, sort_key=None): """ Initialize the Nurserostering Dataset. 
diff --git a/cpmpy/tools/dataset/psplib.py b/cpmpy/tools/dataset/psplib.py index 89f0e93c7..cf47f7c47 100644 --- a/cpmpy/tools/dataset/psplib.py +++ b/cpmpy/tools/dataset/psplib.py @@ -10,14 +10,17 @@ from urllib.error import HTTPError, URLError import zipfile -class PSPLibDataset(object): # torch.utils.data.Dataset compatible +from cpmpy.tools.dataset._base import _Dataset +class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible """ PSPlib Dataset in a PyTorch compatible format. More information on PSPlib can be found here: https://www.om-db.wi.tum.de/psplib/main.html """ + name = "psplib" + def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False): """ Constructor for a dataset object for PSPlib. From e57c5331fd7c4feaf6475e2fb484b7c67ac886d1 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 09:54:58 +0100 Subject: [PATCH 081/152] Fix metadata order --- cpmpy/tools/dataset/_base.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 933373548..982a35bcb 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -90,17 +90,17 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: # Get all compressed XML files and sort for deterministic behavior files = sorted(list(self.dataset_dir.rglob(f"*{self.extension}"))) file_path = files[index] - filename = str(file_path) - if self.transform: - # does not need to remain a filename... - filename = self.transform(filename) - + # Basic metadata about the instance - metadata = self.metadata(file=filename, ) + metadata = self.metadata(file=filename) if self.target_transform: metadata = self.target_transform(metadata) - + + if self.transform: + # does not need to remain a filename... 
+ filename = self.transform(filename) + return filename, metadata From 132725d25708100f7d05e85de943d02a379653f0 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 10:00:45 +0100 Subject: [PATCH 082/152] re-usable dataset downloader --- cpmpy/tools/dataset/_base.py | 118 +++++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 4 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 982a35bcb..6d168a205 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -7,8 +7,26 @@ """ from abc import ABC, abstractmethod +import os import pathlib -from typing import Any, Tuple +import tempfile +from typing import Any, Optional, Tuple +from urllib.error import URLError +from urllib.request import HTTPError, Request, urlopen + +def format_bytes(bytes_num): + """ + Format bytes into human-readable string (e.g., KB, MB, GB). + """ + for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']: + if bytes_num < 1024.0: + return f"{bytes_num:.1f} {unit}" + bytes_num /= 1024.0 + +try: + from tqdm import tqdm +except ImportError: + tqdm = None class _Dataset(ABC): """ @@ -41,7 +59,7 @@ def __init__( files = sorted(list(self.dataset_dir.rglob(f"*{self.extension}"))) if len(files) == 0: - raise ValueError("Cannot find any instances inside dataset. Is it a valid dataset? If so, please report on GitHub.") + raise ValueError(f"Cannot find any instances inside dataset {self.dataset_dir}. Is it a valid dataset? If so, please report on GitHub.") @abstractmethod def category(self) -> dict: @@ -60,14 +78,13 @@ def download(self, *args, **kwargs): """ pass - @abstractmethod def open(self, instance) -> callable: """ How an instance file from the dataset should be opened. Especially usefull when files come compressed and won't work with python standard library's 'open', e.g. '.xz', '.lzma'. 
""" - pass + return open def metadata(self, file) -> dict: metadata = self.category() | { @@ -102,7 +119,100 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: filename = self.transform(filename) return filename, metadata + + @staticmethod + def _download_file(url: str, target: str, destination: Optional[str] = None, + desc: str = None, + chunk_size: int = 1024 * 1024) -> os.PathLike: + """ + Download a file from a URL with progress bar and speed information. + + This method provides a reusable download function with progress updates + similar to pip and uv, showing download progress, speed, and ETA. + + Arguments: + url (str): The URL to download from. + target (str): The target filename to download. + destination (str, optional): The destination path to save the file. + desc (str, optional): Description to show in the progress bar. + If None, uses the filename. + chunk_size (int): Size of each chunk for download in bytes (default=1MB). + + Returns: + str: The destination path where the downloaded file is saved. + """ + + if desc is None: + desc = target + + if destination is None: + temp_destination = tempfile.NamedTemporaryFile(delete=False) + + try: + req = Request(url + target) + with urlopen(req) as response: + total_size = int(response.headers.get('Content-Length', 0)) + + _Dataset._download_sequential(url + target, destination if destination is not None else temp_destination.name, total_size, desc, chunk_size) + if destination is None: + temp_destination.close() + + return pathlib.Path(destination if destination is not None else temp_destination.name) + + except (HTTPError, URLError) as e: + raise ValueError(f"Failed to download file from {url + target}. 
Error: {str(e)}") + + @staticmethod + def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc: str, + chunk_size: int = 1024 * 1024): + """Download file sequentially (fallback method).""" + import sys + + req = Request(url) + with urlopen(req) as response: + if tqdm is not None: + if total_size > 0: + with tqdm(total=total_size, unit='B', unit_scale=True, + unit_divisor=1024, desc=desc, file=sys.stdout, + miniters=1, dynamic_ncols=True, ascii=False) as pbar: + with open(filepath, 'wb') as f: + while True: + chunk = response.read(chunk_size) + if not chunk: + break + f.write(chunk) + pbar.update(len(chunk)) + else: + # Unknown size + with tqdm(unit='B', unit_scale=True, unit_divisor=1024, + desc=desc, file=sys.stdout, miniters=1, + dynamic_ncols=True, ascii=False) as pbar: + with open(filepath, 'wb') as f: + while True: + chunk = response.read(chunk_size) + if not chunk: + break + f.write(chunk) + pbar.update(len(chunk)) + else: + # Fallback to simple download if tqdm is not available + downloaded = 0 + with open(filepath, 'wb') as f: + while True: + chunk = response.read(chunk_size) + if not chunk: + break + f.write(chunk) + downloaded += len(chunk) + if total_size > 0: + percent = (downloaded / total_size) * 100 + sys.stdout.write(f"\r\033[KDownloading {desc}: {format_bytes(downloaded)}/{format_bytes(total_size)} ({percent:.1f}%)") + else: + sys.stdout.write(f"\r\033[KDownloading {desc}: {format_bytes(downloaded)}...") + sys.stdout.flush() + sys.stdout.write("\n") + sys.stdout.flush() From d85aeb23049a65a8d8ff67f261ba67c1d0523374 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 10:14:49 +0100 Subject: [PATCH 083/152] nurserostering reuse downloader --- cpmpy/tools/dataset/nurserostering.py | 93 ++++++++++----------------- 1 file changed, 33 insertions(+), 60 deletions(-) diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py index 6f7932b9a..401a13922 100644 --- 
a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -6,6 +6,7 @@ https://schedulingbenchmarks.org/nrp/ """ + import os import pathlib from typing import Tuple, Any @@ -16,6 +17,7 @@ import cpmpy as cp from cpmpy.tools.dataset._base import _Dataset + # Optional dependencies try: import pandas as pd @@ -55,76 +57,47 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl """ self.root = pathlib.Path(root) - self.instance_dir = pathlib.Path(os.path.join(self.root, "nurserostering")) self.transform = transform self.target_transform = target_transform self.sort_key = sorted if sort_key is None else sort_key - # Create root directory if it doesn't exist - self.root.mkdir(parents=True, exist_ok=True) - - if not self.instance_dir.exists(): - if not download: - raise ValueError(f"Dataset not found in local file system. Please set download=True to download the dataset.") - else: - url = f"https://schedulingbenchmarks.org/nrp/data/instances1_24.zip" # download full repo... - zip_path = pathlib.Path(os.path.join(root,"jsplib-master.zip")) - - print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") - - try: - urlretrieve(url, str(zip_path)) - except (HTTPError, URLError) as e: - raise ValueError(f"No dataset available on {url}. 
Error: {str(e)}") - - # make directory and extract files - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - self.instance_dir.mkdir(parents=True, exist_ok=True) - - # Extract files - for file_info in zip_ref.infolist(): - filename = pathlib.Path(file_info.filename).name - with zip_ref.open(file_info) as source, open(self.instance_dir / filename, 'wb') as target: - target.write(source.read()) + dataset_dir = pathlib.Path(os.path.join(root, "nurserostering")) - # Clean up the zip file - zip_path.unlink() - - - def __len__(self) -> int: - """Return the total number of instances.""" - return len(list(self.instance_dir.glob("*.txt"))) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - """ - Get a single Nurserostering instance filename and metadata. + super().__init__( + dataset_dir=dataset_dir, + transform=transform, target_transform=target_transform, + download=download, extension=".txt" + ) - Args: - index (int): Index of the instance to retrieve + def category(self) -> dict: + return {} # no categories - Returns: - Tuple[Any, Any]: A tuple containing: - - The filename of the instance - - Metadata dictionary with file name, track, year etc. - """ - if isinstance(index, int) and not (0 <= index < len(self)): - raise IndexError("Index out of range") + def download(self): + print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") + + url = "https://schedulingbenchmarks.org/nrp/data/" + target = "instances1_24.zip" # download full repo... 
+ zip_path = self.root / target - # Get all instance files and sort for deterministic behavior - files = self.sort_key(list(self.instance_dir.glob("*.txt"))) # use .txt files instead of xml files - file_path = files[index] + print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") - filename = str(file_path) - if self.transform: - # user might want to process the filename to something else - filename = self.transform(filename) + try: + zip_path = self._download_file(url, target, destination=str(zip_path)) + except ValueError as e: + raise ValueError(f"No dataset available on {url}. Error: {str(e)}") - metadata = dict(name=file_path.stem) + # make directory and extract files + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + self.dataset_dir.mkdir(parents=True, exist_ok=True) - if self.target_transform: - metadata = self.target_transform(metadata) + # Extract files + for file_info in zip_ref.infolist(): + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: + target.write(source.read()) - return filename, metadata + # Clean up the zip file + zip_path.unlink() def open(self, instance: os.PathLike) -> callable: return open(instance, "r") @@ -287,7 +260,7 @@ def parse_scheduling_period(filename: str): shift_on=shift_on, shift_off=shift_off, cover=cover) -def add_fake_names(data, seed=0): +def _add_fake_names(data, seed=0): """ Transform function to add randomly generated names to staff using Faker. @@ -331,7 +304,7 @@ def add_fake_names(data, seed=0): return data -def to_dataframes(data): +def _to_dataframes(data): """ Transform function to convert native data structures to pandas DataFrames. 
From 61b0c426c81ad94ec6e08d199a3d4dfe7745452d Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 11:08:52 +0100 Subject: [PATCH 084/152] Improve all datasets --- cpmpy/tools/dataset/_base.py | 5 +- cpmpy/tools/dataset/jsplib.py | 90 ++++++++++---------- cpmpy/tools/dataset/miplib.py | 58 ++++++------- cpmpy/tools/dataset/mse.py | 34 ++++---- cpmpy/tools/dataset/nurserostering.py | 21 ++--- cpmpy/tools/dataset/opb.py | 32 ++++---- cpmpy/tools/dataset/psplib.py | 113 +++++++++----------------- cpmpy/tools/dataset/xcsp3.py | 81 +++++++----------- 8 files changed, 183 insertions(+), 251 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 6d168a205..3f4482644 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -9,6 +9,7 @@ from abc import ABC, abstractmethod import os import pathlib +import io import tempfile from typing import Any, Optional, Tuple from urllib.error import URLError @@ -78,13 +79,13 @@ def download(self, *args, **kwargs): """ pass - def open(self, instance) -> callable: + def open(self, instance) -> io.TextIOBase: """ How an instance file from the dataset should be opened. Especially usefull when files come compressed and won't work with python standard library's 'open', e.g. '.xz', '.lzma'. 
""" - return open + return open(instance, "r") def metadata(self, file) -> dict: metadata = self.category() | { diff --git a/cpmpy/tools/dataset/jsplib.py b/cpmpy/tools/dataset/jsplib.py index 876e7fe32..b783e9660 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/dataset/jsplib.py @@ -6,19 +6,19 @@ https://github.com/tamy0612/JSPLIB """ + +import io import os import json import pathlib -from os.path import join from typing import Tuple, Any -from urllib.request import urlretrieve -from urllib.error import HTTPError, URLError import zipfile import numpy as np import cpmpy as cp from cpmpy.tools.dataset._base import _Dataset + class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible """ @@ -41,49 +41,49 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl """ self.root = pathlib.Path(root) - self.instance_dir = pathlib.Path(join(self.root, "jsplib")) self.metadata_file = "instances.json" - self.transform = transform - self.target_transform = target_transform - # Create root directory if it doesn't exist - self.root.mkdir(parents=True, exist_ok=True) + dataset_dir = self.root / self.name - if not self.instance_dir.exists(): - if not download: - raise ValueError(f"Dataset not found in local file system. Please set download=True to download the dataset.") - else: - url = f"https://github.com/tamy0612/JSPLIB/archive/refs/heads/master.zip" # download full repo... - url_path = url - zip_path = pathlib.Path(join(root,"jsplib-master.zip")) - - print(f"Downloading JSPLib instances..") - - try: - urlretrieve(url_path, str(zip_path)) - except (HTTPError, URLError) as e: - raise ValueError(f"No dataset available on {url}. 
Error: {str(e)}") - - # make directory and extract files - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - self.instance_dir.mkdir(parents=True, exist_ok=True) - - # Extract files - for file_info in zip_ref.infolist(): - if file_info.filename.startswith("JSPLIB-master/instances/") and file_info.file_size > 0: - filename = pathlib.Path(file_info.filename).name - with zip_ref.open(file_info) as source, open(self.instance_dir / filename, 'wb') as target: - target.write(source.read()) - # extract metadata file - with zip_ref.open("JSPLIB-master/instances.json") as source, open(self.instance_dir / self.metadata_file, 'wb') as target: - target.write(source.read()) - # Clean up the zip file - zip_path.unlink() + super().__init__( + dataset_dir=dataset_dir, + transform=transform, target_transform=target_transform, + download=download, extension="" + ) + + def category(self) -> dict: + return {} # no categories + + def download(self): + + url = "https://github.com/tamy0612/JSPLIB/archive/refs/heads/" # download full repo... + target = "master.zip" + target_download_path = self.root / target + print(f"Downloading JSPLib instances from github.com/tamy0612/JSPLIB") + + try: + target_download_path = self._download_file(url, target, destination=str(target_download_path)) + except ValueError as e: + raise ValueError(f"No dataset available on {url}. 
Error: {str(e)}") + + # Extract files + with zipfile.ZipFile(target_download_path, 'r') as zip_ref: + self.dataset_dir.mkdir(parents=True, exist_ok=True) + + # Extract files + for file_info in zip_ref.infolist(): + if file_info.filename.startswith("JSPLIB-master/instances/") and file_info.file_size > 0: + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: + target.write(source.read()) + # extract metadata file + with zip_ref.open("JSPLIB-master/instances.json") as source, open(self.dataset_dir / self.metadata_file, 'wb') as target: + target.write(source.read()) - def __len__(self) -> int: - """Return the total number of instances.""" - return len(list(self.instance_dir.glob("*"))) + # Clean up the zip file + target_download_path.unlink() + def __getitem__(self, index: int|str) -> Tuple[Any, Any]: """ @@ -101,7 +101,7 @@ def __getitem__(self, index: int|str) -> Tuple[Any, Any]: raise IndexError("Index out of range") # Get all instance files and sort for deterministic behavior # TODO: use natsort instead? - files = sorted(list(self.instance_dir.glob("*[!.json]"))) # exclude metadata file + files = sorted(list(self.dataset_dir.rglob("*[!.json]"))) # exclude metadata file if isinstance(index, int): file_path = files[index] elif isinstance(index, str): @@ -116,7 +116,7 @@ def __getitem__(self, index: int|str) -> Tuple[Any, Any]: # does not need to remain a filename... filename = self.transform(filename) - with open(self.instance_dir / self.metadata_file, "r") as f: + with open(self.dataset_dir / self.metadata_file, "r") as f: for entry in json.load(f): if entry["name"] == file_path.stem: metadata = entry @@ -161,8 +161,12 @@ def parse_jsp(filename: str): return task_to_machines, task_durations + def jobshop_model(task_to_machines, task_durations): + """ + Create a CPMpy model for the Jobshop problem. 
+ """ task_to_machines = np.array(task_to_machines) dur = np.array(task_durations) diff --git a/cpmpy/tools/dataset/miplib.py b/cpmpy/tools/dataset/miplib.py index 166596ae7..1df516b7c 100644 --- a/cpmpy/tools/dataset/miplib.py +++ b/cpmpy/tools/dataset/miplib.py @@ -9,11 +9,18 @@ import gzip import zipfile import pathlib +import io from cpmpy.tools.dataset._base import _Dataset class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible + + """ + MIPLib Dataset in a PyTorch compatible format. + + More information on MIPLib can be found here: https://miplib.zib.de/ + """ name = "miplib" @@ -25,17 +32,16 @@ def __init__( download: bool = False ): """ - Constructor for a dataset object of the MSE competition. + Constructor for a dataset object of the MIPLib competition. Arguments: root (str): Root directory where datasets are stored or will be downloaded to (default="."). - year (int): Competition year of the dataset to use (default=2024). - track (str): Track name specifying which subset of the competition instances to load (default="exact-unweighted"). + year (int): Year of the dataset to use (default=2024). + track (str): Track name specifying which subset of the dataset instances to load (default="exact-unweighted"). transform (callable, optional): Optional transform applied to the instance file path. target_transform (callable, optional): Optional transform applied to the metadata dictionary. download (bool): If True, downloads the dataset if it does not exist locally (default=False). - Raises: ValueError: If the dataset directory does not exist and `download=False`, or if the requested year/track combination is not available. @@ -45,13 +51,7 @@ def __init__( self.year = year self.track = track - # # Check requested dataset - # if not str(year).startswith('20'): - # raise ValueError("Year must start with '20'") - # if not track: - # raise ValueError("Track must be specified, e.g. 
OPT-LIN, DEC-LIN, ...") - - dataset_dir = self.root / "miplib" + dataset_dir = self.root / self.name / str(year) / track super().__init__( dataset_dir=dataset_dir, @@ -59,53 +59,47 @@ def __init__( download=download, extension=".mps.gz" ) - def category(self) -> dict: return { "year": self.year, "track": self.track } - def download(self): - print("Downloading MIPLib instances...") - zip_name = "collection.zip" url = "https://miplib.zib.de/downloads/" + target = "collection.zip" + target_download_path = self.root / target - dataset_dir = self.root / "miplib" + print(f"Downloading MIPLib instances from miplib.zib.de") - if dataset_dir.exists(): - print(f"Using existing dataset directory: {dataset_dir}") - else: - print(f"Downloading {zip_name}...") - try: - cached_filepath = super().download_file(url, target=zip_name, desc=zip_name) - except ValueError as e: - raise ValueError(f"No dataset available. Error: {str(e)}") + try: + target_download_path = self._download_file(url, target, destination=str(target_download_path)) + except ValueError as e: + raise ValueError(f"No dataset available on {url}. 
Error: {str(e)}") - # Extract only the specific track folder from the tar - with zipfile.ZipFile(cached_filepath, 'r') as zip_ref: - # Create track folder in root directory, parents=True ensures recursive creation + # Extract files + with zipfile.ZipFile(target_download_path, 'r') as zip_ref: self.dataset_dir.mkdir(parents=True, exist_ok=True) # Extract files for file_info in zip_ref.infolist(): - # Extract file to family_dir, removing main_folder/track prefix filename = pathlib.Path(file_info.filename).name with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: target.write(source.read()) - # Do not cleanup cached file, as it is in the global cache directory - # zip_path.unlink() - def open(self, instance: os.PathLike) -> callable: + # Clean up the zip file + target_download_path.unlink() + + def open(self, instance: os.PathLike) -> io.TextIOBase: return gzip.open(instance, "rt") if str(instance).endswith(".gz") else open(instance) + if __name__ == "__main__": dataset = MIPLibDataset(download=True) print("Dataset size:", len(dataset)) print("Instance 0:", dataset[0]) - from cpmpy.tools.mps import read_mps + from cpmpy.tools.io.mps import read_mps model = read_mps(dataset[0][0], open=dataset.open) print(model) diff --git a/cpmpy/tools/dataset/mse.py b/cpmpy/tools/dataset/mse.py index b498d09a9..dd1fcc163 100644 --- a/cpmpy/tools/dataset/mse.py +++ b/cpmpy/tools/dataset/mse.py @@ -9,12 +9,13 @@ import lzma import zipfile import pathlib -from urllib.request import urlretrieve -from urllib.error import HTTPError, URLError +import io + +from cpmpy.tools.dataset._base import _Dataset -from .._base import _Dataset class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible + """ MaxSAT Evaluation (MSE) benchmark dataset. @@ -63,7 +64,7 @@ def __init__( if not track: raise ValueError("Track must be specified, e.g. 
OPT-LIN, DEC-LIN, ...") - dataset_dir = self.root / str(year) / track + dataset_dir = self.root / self.name / str(year) / track super().__init__( dataset_dir=dataset_dir, @@ -71,30 +72,27 @@ def __init__( download=download, extension=".wcnf.xz" ) - def category(self) -> dict: return { "year": self.year, "track": self.track } - def download(self): - print(f"Downloading MaxSAT Eval {self.year} {self.track} instances...") - - zip_name = f"mse{str(self.year)[2:]}-{self.track}.zip" + url = f"https://www.cs.helsinki.fi/group/coreo/MSE{self.year}-instances/" + target = f"mse{str(self.year)[2:]}-{self.track}.zip" + target_download_path = self.root / target - url_path = url + zip_name - zip_path = self.root / zip_name - + print(f"Downloading MaxSAT Eval {self.year} {self.track} instances from cs.helsinki.fi") + try: - urlretrieve(url_path, str(zip_path)) - except (HTTPError, URLError) as e: + target_download_path = self._download_file(url, target, destination=str(target_download_path)) + except ValueError as e: raise ValueError(f"No dataset available for year {self.year} and track {self.track}. 
Error: {str(e)}") # Extract only the specific track folder from the tar - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + with zipfile.ZipFile(target_download_path, 'r') as zip_ref: # Create track folder in root directory, parents=True ensures recursive creation self.dataset_dir.mkdir(parents=True, exist_ok=True) @@ -104,12 +102,14 @@ def download(self): filename = pathlib.Path(file_info.filename).name with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: target.write(source.read()) + # Clean up the zip file - zip_path.unlink() + target_download_path.unlink() - def open(self, instance: os.PathLike) -> callable: + def open(self, instance: os.PathLike) -> io.TextIOBase: return lzma.open(instance, "rt") if str(instance).endswith(".xz") else open(instance) + if __name__ == "__main__": dataset = MSEDataset(year=2024, track="exact-weighted", download=True) print("Dataset size:", len(dataset)) diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py index 401a13922..2044ccef7 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -9,11 +9,9 @@ import os import pathlib -from typing import Tuple, Any -from urllib.request import urlretrieve -from urllib.error import HTTPError, URLError import zipfile import re +import io import cpmpy as cp from cpmpy.tools.dataset._base import _Dataset @@ -57,11 +55,9 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl """ self.root = pathlib.Path(root) - self.transform = transform - self.target_transform = target_transform self.sort_key = sorted if sort_key is None else sort_key - dataset_dir = pathlib.Path(os.path.join(root, "nurserostering")) + dataset_dir = self.root / self.name super().__init__( dataset_dir=dataset_dir, @@ -73,21 +69,20 @@ def category(self) -> dict: return {} # no categories def download(self): - print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") 
url = "https://schedulingbenchmarks.org/nrp/data/" target = "instances1_24.zip" # download full repo... - zip_path = self.root / target + target_download_path = self.root / target print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") try: - zip_path = self._download_file(url, target, destination=str(zip_path)) + target_download_path = self._download_file(url, target, destination=str(target_download_path)) except ValueError as e: raise ValueError(f"No dataset available on {url}. Error: {str(e)}") # make directory and extract files - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + with zipfile.ZipFile(target_download_path, 'r') as zip_ref: self.dataset_dir.mkdir(parents=True, exist_ok=True) # Extract files @@ -96,10 +91,10 @@ def download(self): with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: target.write(source.read()) - # Clean up the zip file - zip_path.unlink() + # Clean up the zip file + target_download_path.unlink() - def open(self, instance: os.PathLike) -> callable: + def open(self, instance: os.PathLike) -> io.TextIOBase: return open(instance, "r") diff --git a/cpmpy/tools/dataset/opb.py b/cpmpy/tools/dataset/opb.py index 43a8dcfa8..2d0d876ba 100644 --- a/cpmpy/tools/dataset/opb.py +++ b/cpmpy/tools/dataset/opb.py @@ -8,9 +8,8 @@ import lzma import os import pathlib -from urllib.request import urlretrieve -from urllib.error import HTTPError, URLError import tarfile +import io from cpmpy.tools.dataset._base import _Dataset @@ -34,7 +33,7 @@ def __init__( self, root: str = ".", year: int = 2024, track: str = "OPT-LIN", - competition: bool = False, + competition: bool = True, transform=None, target_transform=None, download: bool = False ): @@ -67,7 +66,7 @@ def __init__( if not track: raise ValueError("Track must be specified, e.g. 
exact-weighted, exact-unweighted, ...") - dataset_dir = self.root / str(year) / track / ('selected' if self.competition else 'normalized') + dataset_dir = self.root / self.name / str(year) / track / ('selected' if self.competition else 'normalized') super().__init__( dataset_dir=dataset_dir, @@ -85,23 +84,21 @@ def metadata(self, file) -> dict: # Add the author to the metadata return super().metadata(file) | {'author': str(file).split(os.sep)[-1].split("_")[0],} - def download(self): - # TODO: add option to filter on competition instances - print(f"Downloading OPB {self.year} {self.track} {'competition' if self.competition else 'non-competition'} instances...") - - url = f"https://www.cril.univ-artois.fr/PB24/benchs/" - year_suffix = str(self.year)[2:] # Drop the starting '20' - url_path = url + f"{'normalized' if not self.competition else 'selected'}-PB{year_suffix}.tar" - tar_path = self.root / f"{'normalized' if not self.competition else 'selected'}-PB{year_suffix}.tar" + + url = "https://www.cril.univ-artois.fr/PB24/benchs/" + target = f"{'normalized' if not self.competition else 'selected'}-PB{str(self.year)[2:]}.tar" + target_download_path = self.root / target + + print(f"Downloading OPB {self.year} {self.track} {'competition' if self.competition else 'non-competition'} instances from www.cril.univ-artois.fr") try: - urlretrieve(url_path, str(tar_path)) - except (HTTPError, URLError) as e: + target_download_path = self._download_file(url, target, destination=str(target_download_path)) + except ValueError as e: raise ValueError(f"No dataset available for year {self.year}. Error: {str(e)}") # Extract only the specific track folder from the tar - with tarfile.open(tar_path, "r:*") as tar_ref: # r:* handles .tar, .tar.gz, .tar.bz2, etc. + with tarfile.open(target_download_path, "r:*") as tar_ref: # r:* handles .tar, .tar.gz, .tar.bz2, etc. 
# Get the main folder name main_folder = None for name in tar_ref.getnames(): @@ -157,11 +154,12 @@ def download(self): target.write(source.read()) # Clean up the tar file - tar_path.unlink() + target_download_path.unlink() - def open(self, instance: os.PathLike) -> callable: + def open(self, instance: os.PathLike) -> io.TextIOBase: return lzma.open(instance, 'rt') if str(instance).endswith(".xz") else open(instance) + if __name__ == "__main__": dataset = OPBDataset(year=2024, track="DEC-LIN", competition=True, download=True) print("Dataset size:", len(dataset)) diff --git a/cpmpy/tools/dataset/psplib.py b/cpmpy/tools/dataset/psplib.py index cf47f7c47..6d3e532e9 100644 --- a/cpmpy/tools/dataset/psplib.py +++ b/cpmpy/tools/dataset/psplib.py @@ -3,11 +3,10 @@ https://www.om-db.wi.tum.de/psplib/getdata_sm.html """ + import os import pathlib -from typing import Tuple, Any -from urllib.request import urlretrieve -from urllib.error import HTTPError, URLError +import io import zipfile from cpmpy.tools.dataset._base import _Dataset @@ -42,9 +41,6 @@ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", self.root = pathlib.Path(root) self.variant = variant self.family = family - self.transform = transform - self.target_transform = target_transform - self.family_dir = pathlib.Path(os.path.join(self.root, variant, family)) self.families = dict( rcpsp = ["j30", "j60", "j90", "j120"] @@ -55,84 +51,49 @@ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", raise ValueError("Only 'rcpsp' variant is supported for now") if family not in self.families[variant]: raise ValueError(f"Unknown problem family. Must be any of {','.join(self.families[variant])}") - # Create root directory if it doesn't exist - self.root.mkdir(parents=True, exist_ok=True) - if not self.family_dir.exists(): - if not download: - raise ValueError(f"Dataset for variant {variant} and family {family} not found. 
Please set download=True to download the dataset.") - else: - print(f"Downloading PSPLib {variant} {family} instances...") - - zip_name = f"{family}.{self.family_codes[variant]}.zip" - url = f"https://www.om-db.wi.tum.de/psplib/files/" - - url_path = url + zip_name - zip_path = self.root / zip_name - - try: - urlretrieve(url_path, str(zip_path)) - except (HTTPError, URLError) as e: - raise ValueError(f"No dataset available for variant {variant} and family {family}. Error: {str(e)}") - - # make directory and extract files - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - # Create track folder in root directory, parents=True ensures recursive creation - self.family_dir.mkdir(parents=True, exist_ok=True) - - # Extract files - for file_info in zip_ref.infolist(): - # Extract file to family_dir, removing main_folder/track prefix - filename = pathlib.Path(file_info.filename).name - with zip_ref.open(file_info) as source, open(self.family_dir / filename, 'wb') as target: - target.write(source.read()) - # Clean up the zip file - zip_path.unlink() - - def open(self, instance: os.PathLike) -> callable: - return open(instance, "r") + dataset_dir = self.root / self.name / self.variant / self.family + super().__init__( + dataset_dir=dataset_dir, + transform=transform, target_transform=target_transform, + download=download, extension=f".{self.family_codes[self.variant]}" + ) - def __len__(self) -> int: - """Return the total number of instances.""" - return len(list(self.family_dir.glob(f"*.{self.family_codes[self.variant]}"))) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - """ - Get a single RCPSP instance filename and metadata. + def category(self) -> dict: + return { + "variant": self.variant, + "family": self.family + } - Args: - index (int): Index of the instance to retrieve - - Returns: - Tuple[Any, Any]: A tuple containing: - - The filename of the instance - - Metadata dictionary with file name, track, year etc. 
- """ - if index < 0 or index >= len(self): - raise IndexError("Index out of range") + def download(self): - # Get all instance files and sort for deterministic behavior # TODO: use natsort instead? - files = sorted(list(self.family_dir.glob(f"*.{self.family_codes[self.variant]}"))) - file_path = files[index] + url = "https://www.om-db.wi.tum.de/psplib/files/" + target = f"{self.family}.{self.family_codes[self.variant]}.zip" + target_download_path = self.root / target + + print(f"Downloading PSPLib {self.variant} {self.family} instances from www.om-db.wi.tum.de") - filename = str(file_path) - if self.transform: - # does not need to remain a filename... - filename = self.transform(filename) - - # Basic metadata about the instance - metadata = dict( - variant = self.variant, - family = self.family, - name = file_path.stem - ) + try: + target_download_path = self._download_file(url, target, destination=str(target_download_path)) + except ValueError as e: + raise ValueError(f"No dataset available for variant {self.variant} and family {self.family}. 
Error: {str(e)}") - if self.target_transform: - metadata = self.target_transform(metadata) + # make directory and extract files + with zipfile.ZipFile(target_download_path, 'r') as zip_ref: + # Create track folder in root directory, parents=True ensures recursive creation + self.dataset_dir.mkdir(parents=True, exist_ok=True) - return filename, metadata - + # Extract files + for file_info in zip_ref.infolist(): + # Extract file to family_dir, removing main_folder/track prefix + filename = pathlib.Path(file_info.filename).name + with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: + target.write(source.read()) + # Clean up the zip file + target_download_path.unlink() + + if __name__ == "__main__": dataset = PSPLibDataset(variant="rcpsp", family="j30", download=True) print("Dataset size:", len(dataset)) diff --git a/cpmpy/tools/dataset/xcsp3.py b/cpmpy/tools/dataset/xcsp3.py index 042c8a9b2..2fd989da7 100644 --- a/cpmpy/tools/dataset/xcsp3.py +++ b/cpmpy/tools/dataset/xcsp3.py @@ -4,75 +4,53 @@ https://xcsp.org/instances/ """ -from functools import partial import os import lzma import zipfile import pathlib -from urllib.request import urlretrieve -from urllib.error import HTTPError, URLError +import io from cpmpy.tools.dataset._base import _Dataset -class XCSP3Dataset(_Dataset): - """ - XCSP3 benchmark dataset. - - Provides access to benchmark instances from the XCSP3 - competitions. Instances are grouped by `year` and `track` (e.g., - `"CSP"`, `"eCOP"`) and stored as `.xml.lzma` files. - If the dataset is not available locally, it can be automatically - downloaded and extracted. +class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible - More information on the competition can be found here: https://xcsp.org/competitions/ + """ + XCSP3 Dataset in a PyTorch compatible format. 
+ + Arguments: + root (str): Root directory containing the XCSP3 instances (if 'download', instances will be downloaded to this location) + year (int): Competition year (2022, 2023 or 2024) + track (str, optional): Filter instances by track type (e.g., "COP", "CSP", "MiniCOP") + transform (callable, optional): Optional transform to be applied on the instance data (the file path of each problem instance) + target_transform (callable, optional): Optional transform to be applied on the metadata (the metadata dictionary of each problem instance) + download (bool): If True, downloads the dataset from the internet and puts it in `root` directory """ name = "xcsp3" - def __init__( - self, - root: str = ".", - year: int = 2023, track: str = "CSP", - transform=None, target_transform=None, - download: bool = False - ): + def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False): """ - Constructor for a dataset object of the XCP3 competition. - - Arguments: - root (str): Root directory where datasets are stored or will be downloaded to (default="."). - year (int): Competition year of the dataset to use (default=2024). - track (str): Track name specifying which subset of the competition instances to load (default="CSP"). - transform (callable, optional): Optional transform applied to the instance file path. - target_transform (callable, optional): Optional transform applied to the metadata dictionary. - download (bool): If True, downloads the dataset if it does not exist locally (default=False). - - - Raises: - ValueError: If the dataset directory does not exist and `download=False`, - or if the requested year/track combination is not available. + Initialize the XCSP3 Dataset. 
""" self.root = pathlib.Path(root) self.year = year self.track = track - # Check requested dataset + dataset_dir = self.root / self.name / str(year) / track + if not str(year).startswith('20'): raise ValueError("Year must start with '20'") if not track: - raise ValueError("Track must be specified, e.g. COP, CSP, ...") - - dataset_dir = self.root / str(year) / track + raise ValueError("Track must be specified, e.g. COP, CSP, MiniCOP, ...") super().__init__( - dataset_dir=dataset_dir, + dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".xml.lzma" ) - def category(self) -> dict: return { "year": self.year, @@ -80,20 +58,20 @@ def category(self) -> dict: } def download(self): - print(f"Downloading XCSP3 {self.year} instances...") - url = f"https://www.cril.univ-artois.fr/~lecoutre/compets/" - year_suffix = str(self.year)[2:] # Drop the starting '20' - url_path = url + f"instancesXCSP{year_suffix}.zip" - zip_path = self.root / f"instancesXCSP{year_suffix}.zip" + url = "https://www.cril.univ-artois.fr/~lecoutre/compets/" + target = f"instancesXCSP{str(self.year)[2:]}.zip" + target_download_path = self.root / target + + print(f"Downloading XCSP3 {self.year} instances from www.cril.univ-artois.fr") try: - urlretrieve(url_path, str(zip_path)) - except (HTTPError, URLError) as e: + target_download_path = self._download_file(url, target, destination=str(target_download_path)) + except ValueError as e: raise ValueError(f"No dataset available for year {self.year}. 
Error: {str(e)}") - + # Extract only the specific track folder from the zip - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + with zipfile.ZipFile(target_download_path, 'r') as zip_ref: # Get the main folder name (e.g., "024_V3") main_folder = None for name in zip_ref.namelist(): @@ -127,10 +105,11 @@ def download(self): filename = pathlib.Path(file_info.filename).name with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: target.write(source.read()) + # Clean up the zip file - zip_path.unlink() + target_download_path.unlink() - def open(self, instance: os.PathLike) -> callable: + def open(self, instance: os.PathLike) -> io.TextIOBase: return lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance) From 0c838b636f21daae7f2b9620a8c8b36713701259 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 11:09:07 +0100 Subject: [PATCH 085/152] add dataset classes to module --- cpmpy/tools/dataset/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpmpy/tools/dataset/__init__.py b/cpmpy/tools/dataset/__init__.py index e69de29bb..65fb041b8 100644 --- a/cpmpy/tools/dataset/__init__.py +++ b/cpmpy/tools/dataset/__init__.py @@ -0,0 +1,7 @@ +from .miplib import MIPLibDataset +from .jsplib import JSPLibDataset +from .psplib import PSPLibDataset +from .nurserostering import NurseRosteringDataset +from .xcsp3 import XCSP3Dataset +from .opb import OPBDataset +from .mse import MSEDataset From 347662058007577a786234376b2518de9d256ebd Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 11:16:14 +0100 Subject: [PATCH 086/152] Simplify __main__ keep more detailed usage for example --- cpmpy/tools/dataset/jsplib.py | 25 ------------------------- cpmpy/tools/dataset/miplib.py | 4 ---- cpmpy/tools/dataset/nurserostering.py | 6 ++++++ cpmpy/tools/dataset/psplib.py | 4 ++-- 4 files changed, 8 insertions(+), 31 deletions(-) diff --git a/cpmpy/tools/dataset/jsplib.py 
b/cpmpy/tools/dataset/jsplib.py index b783e9660..7ce5b36a1 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/dataset/jsplib.py @@ -195,31 +195,6 @@ def jobshop_model(task_to_machines, task_durations): if __name__ == "__main__": - dataset = JSPLibDataset(root=".", download=True, transform=parse_jsp) print("Dataset size:", len(dataset)) print("Instance 0:") - (machines, dur), metadata = dataset[0] - print("Machines:", machines) - print("Durations:", dur) - print("Metadata:", metadata) - - print("Solving", metadata['name']) - model, (start, makespan) = jobshop_model(task_to_machines=machines, task_durations=dur) - assert model.solve(time_limit=10) - - import pandas as pd - import plotly.express as px - import plotly.io as pio - pio.renderers.default = "browser" # ensure plotly opens figure in browser - - df = pd.DataFrame({"Start": start.value().flat, "Duration": dur.flat, "Machine": machines.flat}) - df["Job"] = [j for j in range(metadata['jobs']) for _ in range(metadata['machines']) ] - df["Task"] = [j for _ in range(metadata['machines']) for j in range(metadata['jobs'])] - df["Name"] = "T" + df["Job"].astype(str) + "-" + df["Task"].astype(str) - print(df) - ghant_fig = px.bar(df, orientation='h', - base="Start", x="Duration", y="Machine", color="Job", text="Name", - title=f"Jobshop instance {metadata['name']}, makespan: {makespan.value()}, status: {model.status()}" - ) - ghant_fig.show() \ No newline at end of file diff --git a/cpmpy/tools/dataset/miplib.py b/cpmpy/tools/dataset/miplib.py index 1df516b7c..f80634e28 100644 --- a/cpmpy/tools/dataset/miplib.py +++ b/cpmpy/tools/dataset/miplib.py @@ -99,7 +99,3 @@ def open(self, instance: os.PathLike) -> io.TextIOBase: dataset = MIPLibDataset(download=True) print("Dataset size:", len(dataset)) print("Instance 0:", dataset[0]) - - from cpmpy.tools.io.mps import read_mps - model = read_mps(dataset[0][0], open=dataset.open) - print(model) diff --git a/cpmpy/tools/dataset/nurserostering.py 
b/cpmpy/tools/dataset/nurserostering.py index 2044ccef7..8c23d9a45 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -515,3 +515,9 @@ def nurserostering_model(horizon, shifts, staff, days_off, shift_on, shift_off, for d, val in enumerate(row): print(f" {str(val):>{col_widths[d]}}", end="") print() + + +if __name__ == "__main__": + dataset = NurseRosteringDataset(download=True) + print("Dataset size:", len(dataset)) + print("Instance 0:", dataset[0]) \ No newline at end of file diff --git a/cpmpy/tools/dataset/psplib.py b/cpmpy/tools/dataset/psplib.py index 6d3e532e9..ac685976a 100644 --- a/cpmpy/tools/dataset/psplib.py +++ b/cpmpy/tools/dataset/psplib.py @@ -92,8 +92,8 @@ def download(self): target.write(source.read()) # Clean up the zip file target_download_path.unlink() - - + + if __name__ == "__main__": dataset = PSPLibDataset(variant="rcpsp", family="j30", download=True) print("Dataset size:", len(dataset)) From 5fbc2eecf5f03434bd03dadfd70fded2e9615ac2 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 13:57:23 +0100 Subject: [PATCH 087/152] small tweaks to readers and writers --- cpmpy/tools/io/__init__.py | 13 +++---- cpmpy/tools/io/jsplib.py | 6 +++- cpmpy/tools/io/mps.py | 30 ---------------- cpmpy/tools/io/nurserostering.py | 7 +++- cpmpy/tools/io/opb.py | 2 +- cpmpy/tools/io/rcpsp.py | 6 +++- cpmpy/tools/io/reader.py | 8 ++--- cpmpy/tools/io/scip.py | 59 ++++++++++++++++++++++++-------- cpmpy/tools/io/utils.py | 2 +- cpmpy/tools/io/wcnf.py | 6 +++- cpmpy/tools/io/writer.py | 3 +- 11 files changed, 80 insertions(+), 62 deletions(-) delete mode 100644 cpmpy/tools/io/mps.py diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index a79a2d3ea..b1ede66e4 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -4,19 +4,20 @@ This module provides tools to read and write models in various formats. 
Use the generic `read(..., format="...")` and `write(..., format="...")` functions to read and write models in one of the supported formats. + +Some formats can be auto-detected from the file extension. """ from .writer import write, write_formats from .reader import read, read_formats from .utils import get_extension, get_format -from .jsplib import read_jsplib -# TODO: this tool is just a wrapper around read_scip and write_scip, -# do we want such a wrapper for each format scip provides? -# You can already use the generic `read()` and `write()` to read and write any format scip provides. -from .mps import read_mps, write_mps +# Problem datasets +from .jsplib import read_jsplib from .nurserostering import read_nurserostering -from .opb import read_opb from .rcpsp import read_rcpsp + +# Model datasets +from .opb import read_opb from .scip import read_scip, write_scip from .wcnf import read_wcnf \ No newline at end of file diff --git a/cpmpy/tools/io/jsplib.py b/cpmpy/tools/io/jsplib.py index 11c820faa..7f1c13c1a 100644 --- a/cpmpy/tools/io/jsplib.py +++ b/cpmpy/tools/io/jsplib.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## jsplib.py +## """ Parser for the JSPLib format. @@ -15,7 +20,6 @@ import os import sys -import lzma import argparse import cpmpy as cp import numpy as np diff --git a/cpmpy/tools/io/mps.py b/cpmpy/tools/io/mps.py deleted file mode 100644 index fef0e87dd..000000000 --- a/cpmpy/tools/io/mps.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -MPS parser. - -This file implements helper functions for reading and writing MPS-formatted LP/MIP models. - -================= -List of functions -================= - -.. 
autosummary:: - :nosignatures: - - read_mps - write_mps -""" - - -from typing import Optional, Union -import os - -import cpmpy as cp -from cpmpy.tools.scip.parser import read_scip - - -def read_mps(mps: Union[str, os.PathLike], open=open, assume_integer:bool=False) -> cp.Model: - return read_scip(mps, open, assume_integer) - -def write_mps(model: cp.Model, file_path: Optional[str] = None) -> str: - pass - diff --git a/cpmpy/tools/io/nurserostering.py b/cpmpy/tools/io/nurserostering.py index 17547faf9..89e292085 100644 --- a/cpmpy/tools/io/nurserostering.py +++ b/cpmpy/tools/io/nurserostering.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## nurserostering.py +## """ Parser for the Nurse Rostering format. @@ -20,7 +25,7 @@ import cpmpy as cp from typing import Union -from cpmpy.tools.dataset.problem.nurserostering import ( +from cpmpy.tools.dataset.nurserostering import ( parse_scheduling_period, nurserostering_model ) diff --git a/cpmpy/tools/io/opb.py b/cpmpy/tools/io/opb.py index f63db7c7d..8d6910569 100644 --- a/cpmpy/tools/io/opb.py +++ b/cpmpy/tools/io/opb.py @@ -1,7 +1,7 @@ #!/usr/bin/env python #-*- coding:utf-8 -*- ## -## __init__.py +## opb.py ## """ OPB parser. diff --git a/cpmpy/tools/io/rcpsp.py b/cpmpy/tools/io/rcpsp.py index cadc32482..84aa29afa 100644 --- a/cpmpy/tools/io/rcpsp.py +++ b/cpmpy/tools/io/rcpsp.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## rcpsp.py +## """ Parser for the PSPLIB RCPSP format. 
@@ -15,7 +20,6 @@ import os import sys -import lzma import argparse import cpmpy as cp from io import StringIO diff --git a/cpmpy/tools/io/reader.py b/cpmpy/tools/io/reader.py index 52631a6e4..12a4c7c82 100644 --- a/cpmpy/tools/io/reader.py +++ b/cpmpy/tools/io/reader.py @@ -10,17 +10,14 @@ read read_formats - -============== -Module details -============== """ from typing import Callable, List, Optional import cpmpy as cp -from cpmpy.tools.scip.parser import read_scip from cpmpy.tools.dimacs import read_dimacs +from cpmpy.tools.io.scip import read_scip +from cpmpy.tools.io.wcnf import read_wcnf from cpmpy.tools.io.utils import get_format # mapping format names to appropriate reader functions @@ -32,6 +29,7 @@ "gms": read_scip, "pip": read_scip, "dimacs": read_dimacs, + "wcnf": read_wcnf, } diff --git a/cpmpy/tools/io/scip.py b/cpmpy/tools/io/scip.py index 52a5f881b..3ea825c52 100644 --- a/cpmpy/tools/io/scip.py +++ b/cpmpy/tools/io/scip.py @@ -1,5 +1,10 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## scip.py +## """ -This file implements helper functions for exporting CPMpy models from and to various data +This file implements helper functions for converting CPMpy models to and from various data formats supported by the SCIP optimization suite. ============ @@ -12,15 +17,6 @@ $ pip install cpmpy[io.scip] -=============== -List of classes -=============== - -.. 
autosummary:: - :nosignatures: - - CPM_scip - ================= List of functions ================= @@ -31,15 +27,13 @@ read_scip write_scip to_scip - -============== -Module details -============== """ +import argparse import math import os +import sys import tempfile import numpy as np import cpmpy as cp @@ -562,4 +556,41 @@ def write_scip(model: cp.Model, fname: Optional[str] = None, format: str = "mps" with open(fname, "r") as f: return f.read() +def main(): + parser = argparse.ArgumentParser(description="Parse and solve a SCIP compatible model using CPMpy") + parser.add_argument("model", help="Path to a SCIP compatible file (or raw string if --string is given)") + parser.add_argument("-s", "--solver", default=None, help="Solver name to use (default: CPMpy's default)") + parser.add_argument("--string", action="store_true", help="Interpret the first argument (model) as a raw OPB string instead of a file path") + parser.add_argument("-t", "--time-limit", type=int, default=None, help="Time limit for the solver in seconds (default: no limit)") + args = parser.parse_args() + + # Build the CPMpy model + try: + if args.string: + model = read_scip(args.model) + else: + model = read_scip(os.path.expanduser(args.model)) + except Exception as e: + sys.stderr.write(f"Error reading model: {e}\n") + sys.exit(1) + + # Solve the model + try: + if args.solver: + result = model.solve(solver=args.solver, time_limit=args.time_limit) + else: + result = model.solve(time_limit=args.time_limit) + except Exception as e: + sys.stderr.write(f"Error solving model: {e}\n") + sys.exit(1) + + # Print results + print("Status:", model.status()) + if result is not None: + if model.has_objective(): + print("Objective:", model.objective_value()) + else: + print("No solution found.") +if __name__ == "__main__": + main() diff --git a/cpmpy/tools/io/utils.py b/cpmpy/tools/io/utils.py index a31ad8d98..85a1ca9fa 100644 --- a/cpmpy/tools/io/utils.py +++ b/cpmpy/tools/io/utils.py @@ -9,7 +9,7 @@ "fzn" : 
"fzn", "gms" : "gms", "pip" : "pip", - "wcnf" : "dimacs", + "wcnf" : "wcnf", "cnf" : "dimacs", } diff --git a/cpmpy/tools/io/wcnf.py b/cpmpy/tools/io/wcnf.py index 84b484979..5cea77608 100644 --- a/cpmpy/tools/io/wcnf.py +++ b/cpmpy/tools/io/wcnf.py @@ -1,3 +1,8 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## wcnf.py +## """ Parser for the WCNF format. @@ -15,7 +20,6 @@ import os import sys -import lzma import argparse import cpmpy as cp from io import StringIO diff --git a/cpmpy/tools/io/writer.py b/cpmpy/tools/io/writer.py index aebcb2d8f..baa32f3d9 100644 --- a/cpmpy/tools/io/writer.py +++ b/cpmpy/tools/io/writer.py @@ -21,8 +21,8 @@ from functools import partial import cpmpy as cp -from cpmpy.tools.scip.parser import write_scip from cpmpy.tools.dimacs import write_dimacs +from cpmpy.tools.io.scip import write_scip # mapping format names to appropriate writer functions _writer_map = { @@ -40,6 +40,7 @@ # "wbo": partial(write_scip, format="wbo"), # requires SIMPL, not included in pip package # "zpl": partial(write_scip, format="zpl"), # requires SIMPL, not included in pip package "dimacs": write_dimacs, + # "wcnf": write_wcnf, # currently not supported } def _get_writer(format: str) -> Callable: From c4e6b3bfa49c096dc1cfc8e082d50a2f41f60e4d Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 14:06:36 +0100 Subject: [PATCH 088/152] Add Ignace's opb writer --- cpmpy/tools/io/opb.py | 230 +++++++++++++++++++++++++++++++++++- cpmpy/tools/io/reader.py | 2 + cpmpy/tools/io/utils.py | 1 + cpmpy/tools/io/writer.py | 2 + cpmpy/tools/opb/writer.py | 239 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 472 insertions(+), 2 deletions(-) create mode 100644 cpmpy/tools/opb/writer.py diff --git a/cpmpy/tools/io/opb.py b/cpmpy/tools/io/opb.py index 8d6910569..949ad4f44 100644 --- a/cpmpy/tools/io/opb.py +++ b/cpmpy/tools/io/opb.py @@ -23,14 +23,27 @@ import os import re import sys -import lzma import argparse -import cpmpy as cp from io import StringIO 
from typing import Union from functools import reduce from operator import mul + +import cpmpy as cp +from cpmpy.transformations.normalize import toplevel_list,simplify_boolean +from cpmpy.transformations.safening import no_partial_functions, safen_objective +from cpmpy.transformations.decompose_global import decompose_in_tree, decompose_objective +from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective +from cpmpy.transformations.reification import only_implies, only_bv_reifies +from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv_wsum +from cpmpy.transformations.int2bool import int2bool, _encode_lin_expr +from cpmpy.transformations.get_variables import get_variables +from cpmpy.expressions.variables import _IntVarImpl, NegBoolView, _BoolVarImpl +from cpmpy.expressions.core import Operator, Comparison +from cpmpy import __version__ + + # Regular expressions HEADER_RE = re.compile(r'(.*)\s*#variable=\s*(\d+)\s*#constraint=\s*(\d+).*') TERM_RE = re.compile(r"([+-]?\d+)((?:\s+~?x\d+)+)") @@ -187,6 +200,219 @@ def read_opb(opb: Union[str, os.PathLike], open=open) -> cp.Model: return model +def write_opb(model, fname=None, encoding="auto"): + """ + Export a CPMpy model to the OPB (Pseudo-Boolean) format. + + This function transforms the given CPMpy model into OPB format, which is a standard textual + format for representing Pseudo-Boolean optimization problems. The OPB file will contain + a header specifying the number of variables and constraints, the objective (optional), and the + list of constraints using integer-weighted Boolean variables. + + Args: + model (cp.Model): The CPMpy model to export. + fname (str, optional): The file name to write the OPB output to. If None, the OPB string is returned. + encoding (str, optional): The encoding used for `int2bool`. Options: ("auto", "direct", "order", "binary"). + + Returns: + str or None: The OPB string if `fname` is None, otherwise nothing (writes to file). 
+ + Format: + * #variable= #constraint= + * OPB file generated by CPMpy version + min/max: ; + ; + ; + ... + + Note: + Some solvers only support variable names of the form x. The OPB writer will remap + all CPMpy variables to such a format internally. + + Example: + >>> from cpmpy import * + >>> x = boolvar(shape=3) + >>> m = Model(x[0] + x[1] + x[2] >= 2) + >>> print(write_opb(m)) + """ + + csemap, ivarmap = dict(), dict() + opb_cons = _transform(model.constraints, csemap, ivarmap, encoding) + + if model.objective_ is not None: + opb_obj, const, extra_cons = _transform_objective(model.objective_, csemap, ivarmap, encoding) + opb_cons += extra_cons + else: + opb_obj = None + + # Form header and variable mapping + # Use all variables occurring in constraints and the objective + all_vars = get_variables(opb_cons + ([opb_obj] if opb_obj is not None else [])) + out = [ + f"* #variable= {len(all_vars)} #constraint= {len(opb_cons)}", + f"* OPB file generated by CPMpy version {__version__}", + ] + # Remap variables to 'x1', 'x2', ..., the standard OPB way + varmap = {v: f"x{i+1}" for i, v in enumerate(all_vars)} + + # Write objective, if present + if model.objective_ is not None: + objective_str = _wsum_to_str(opb_obj, varmap) + out.append(f"{'min' if model.objective_is_min else 'max'}: {objective_str};") + + # Write constraints + for cons in opb_cons: + assert isinstance(cons, Comparison), f"Expected a comparison, but got {cons}" + lhs, rhs = cons.args + constraint_str = f"{_wsum_to_str(lhs, varmap)} {cons.name} {rhs};" + out.append(constraint_str) + + # Output to file or string + contents = "\n".join(out) + if fname is None: + return contents + else: + with open(fname, "w") as f: + f.write(contents) + +def _normalized_comparison(lst_of_expr): + """ + Convert a list of linear CPMpy expressions into OPB-compatible pseudo-Boolean constraints. 
+ + Transforms a list of Boolean-linear CPMpy expressions (as output by `linearize_constraint`) into a list + of OPB-normalized constraints, expressed as comparisons between weighted Boolean sums + (using "wsum") and integer constants. Handles Boolean vars, reifications, implications, + and ensures all equalities are decomposed into two inequalities. + + Args: + lst_of_expr (list): List of CPMpy Boolean-linear expressions. + + Returns: + list: List of normalized CPMpy `Comparison` objects representing pseudo-Boolean constraints. + """ + newlist = [] + for cpm_expr in lst_of_expr: + if isinstance(cpm_expr, cp.BoolVal) and cpm_expr.value() is False: + raise NotImplementedError(f"Cannot transform {cpm_expr} to OPB constraint") + + # single Boolean variable + if isinstance(cpm_expr, _BoolVarImpl): + cpm_expr = Operator("sum", [cpm_expr]) >= 1 + + # implication + if isinstance(cpm_expr, Operator) and cpm_expr.name == "->": + bv, subexpr = cpm_expr.args + assert isinstance(subexpr, _BoolVarImpl), "Only bv -> bv should reach here, but got {subexpr}" + cpm_expr = Operator("wsum", [[-1, 1], [bv, subexpr]]) >= 0 + newlist.append(cpm_expr) + continue + + # Comparison, can be single Boolean variable or (weighted) sum of Boolean variables + if isinstance(cpm_expr, Comparison): + lhs, rhs = cpm_expr.args + + if isinstance(lhs, _BoolVarImpl): + lhs = Operator("sum", [lhs]) + if lhs.name == "sum": + lhs = Operator("wsum", [[1]*len(lhs.args), lhs.args]) + + assert isinstance(lhs, Operator) and lhs.name == "wsum", f"Expected a wsum, but got {lhs}" + + # convert comparisons into >= constraints + if cpm_expr.name == "==": + newlist += _normalized_comparison([lhs <= rhs]) + newlist += _normalized_comparison([lhs >= rhs]) + elif cpm_expr.name == ">=": + newlist.append(lhs >= rhs) + elif cpm_expr.name == "<=": + new_weights = [-w for w in lhs.args[0]] + newlist.append(Operator("wsum", [new_weights, lhs.args[1]]) >= -rhs) + else: + raise ValueError(f"Unknown comparison {cpm_expr.name}") + 
else: + raise NotImplementedError(f"Expected a comparison, but got {cpm_expr}") + + return newlist + +def _wsum_to_str(cpm_expr, varmap): + """ + Convert a weighted sum CPMpy expression to a string in OPB format. + + args: + cpm_expr (Operator): wsum CPMpy expression + varmap (dict): dictionary mapping CPMpy variables to OPB variable names + """ + assert isinstance(cpm_expr, Operator) and cpm_expr.name == "wsum", f"Expected a wsum, but got {cpm_expr}" + weights, args = cpm_expr.args + + out = [] + for w, var in zip(weights, args): + var = varmap[var] if not isinstance(var, NegBoolView) else f"~{varmap[var._bv]}" + if w < 0: + out.append(f"- {w} {var}") + elif w > 0: + out.append(f"+ {w} {var}") + else: + pass # zero weight, ignore + + str_out = " ".join(out) + return str_out + +def _transform(cpm_expr, csemap, ivarmap, encoding="auto"): + """ + Transform a list of CPMpy expressions into a list of Pseudo-Boolean constraints. + """ + + cpm_cons = toplevel_list(cpm_expr) + cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"div", "mod", "element"}) + cpm_cons = decompose_in_tree(cpm_cons, + supported={"alldifferent"}, # alldiff has a specialized MIP decomp in linearize + csemap=csemap + ) + cpm_cons = simplify_boolean(cpm_cons) + cpm_cons = flatten_constraint(cpm_cons, csemap=csemap) # flat normal form + cpm_cons = only_bv_reifies(cpm_cons, csemap=csemap) + cpm_cons = only_implies(cpm_cons, csemap=csemap) + cpm_cons = linearize_constraint( + cpm_cons, supported=frozenset({"sum", "wsum"}), csemap=csemap + ) + cpm_cons = int2bool(cpm_cons, ivarmap, encoding=encoding) + + return _normalized_comparison(cpm_cons) + +def _transform_objective(expr, csemap, ivarmap, encoding="auto"): + """ + Transform a CPMpy objective expression into a weighted sum expression + """ + + # transform objective + obj, safe_cons = safen_objective(expr) + obj, decomp_cons = decompose_objective(obj, supported={"alldifferent"}, + csemap=csemap) + obj, flat_cons = flatten_objective(obj, 
csemap=csemap) + obj = only_positive_bv_wsum(obj) # remove negboolviews + + weights, xs, const = [], [], 0 + # we assume obj is a var, a sum or a wsum (over int and bool vars) + if isinstance(obj, _IntVarImpl) or isinstance(obj, NegBoolView): # includes _BoolVarImpl + weights = [1] + xs = [obj] + elif obj.name == "sum": + xs = obj.args + weights = [1] * len(xs) + elif obj.name == "wsum": + weights, xs = obj.args + else: + raise NotImplementedError(f"OPB: Non supported objective {obj} (yet?)") + + terms, cons, k = _encode_lin_expr(ivarmap, xs, weights, encoding) + + # remove terms with coefficient 0 (`only_positive_coefficients_` may return them and RC2 does not accept them) + terms = [(w, x) for w,x in terms if w != 0] + + obj = Operator("wsum", [[w for w,x in terms], [x for w,x in terms]]) + return obj, const, safe_cons + decomp_cons + flat_cons + def main(): parser = argparse.ArgumentParser(description="Parse and solve an OPB model using CPMpy") diff --git a/cpmpy/tools/io/reader.py b/cpmpy/tools/io/reader.py index 12a4c7c82..a3203a7cb 100644 --- a/cpmpy/tools/io/reader.py +++ b/cpmpy/tools/io/reader.py @@ -18,6 +18,7 @@ from cpmpy.tools.dimacs import read_dimacs from cpmpy.tools.io.scip import read_scip from cpmpy.tools.io.wcnf import read_wcnf +from cpmpy.tools.io.opb import read_opb from cpmpy.tools.io.utils import get_format # mapping format names to appropriate reader functions @@ -29,6 +30,7 @@ "gms": read_scip, "pip": read_scip, "dimacs": read_dimacs, + "opb": read_opb, "wcnf": read_wcnf, } diff --git a/cpmpy/tools/io/utils.py b/cpmpy/tools/io/utils.py index 85a1ca9fa..9dc7bd1e0 100644 --- a/cpmpy/tools/io/utils.py +++ b/cpmpy/tools/io/utils.py @@ -11,6 +11,7 @@ "pip" : "pip", "wcnf" : "wcnf", "cnf" : "dimacs", + "opb" : "opb", } _extension_map = {} diff --git a/cpmpy/tools/io/writer.py b/cpmpy/tools/io/writer.py index baa32f3d9..c4074e22d 100644 --- a/cpmpy/tools/io/writer.py +++ b/cpmpy/tools/io/writer.py @@ -23,6 +23,7 @@ import cpmpy as cp from 
cpmpy.tools.dimacs import write_dimacs from cpmpy.tools.io.scip import write_scip +from cpmpy.tools.io.opb import write_opb # mapping format names to appropriate writer functions _writer_map = { @@ -40,6 +41,7 @@ # "wbo": partial(write_scip, format="wbo"), # requires SIMPL, not included in pip package # "zpl": partial(write_scip, format="zpl"), # requires SIMPL, not included in pip package "dimacs": write_dimacs, + "opb": write_opb, # "wcnf": write_wcnf, # currently not supported } diff --git a/cpmpy/tools/opb/writer.py b/cpmpy/tools/opb/writer.py new file mode 100644 index 000000000..4b739da5f --- /dev/null +++ b/cpmpy/tools/opb/writer.py @@ -0,0 +1,239 @@ +""" + This file implements helper functions for exporting CPMpy models from and to OPB format. + OPB is a textual format to represent Pseudo-Boolean problems. + The header of the file is formatted as ``* #variable= #constraint= ``. + If the number of variables and constraints are not given, it is inferred by the writer. + + Each remaining line of the file is formatted as a constraint. + A constraint is formatted as a string of integers. + An integer represents a Boolean variable and a negative Boolean variable is represented using a `'-'` sign. 
+""" + +import cpmpy as cp + +from cpmpy.transformations.normalize import toplevel_list,simplify_boolean +from cpmpy.transformations.safening import no_partial_functions, safen_objective +from cpmpy.transformations.decompose_global import decompose_in_tree, decompose_objective +from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective +from cpmpy.transformations.reification import only_implies, only_bv_reifies +from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv_wsum +from cpmpy.transformations.int2bool import int2bool, _encode_lin_expr +from cpmpy.transformations.get_variables import get_variables +from cpmpy.expressions.variables import _IntVarImpl, NegBoolView, _BoolVarImpl +from cpmpy.expressions.core import Operator, Comparison + +from cpmpy import __version__ + +def write_opb(model, fname=None, encoding="auto"): + """ + Export a CPMpy model to the OPB (Pseudo-Boolean) format. + + This function transforms the given CPMpy model into OPB format, which is a standard textual + format for representing Pseudo-Boolean optimization problems. The OPB file will contain + a header specifying the number of variables and constraints, the objective (optional), and the + list of constraints using integer-weighted Boolean variables. + + Args: + model (cp.Model): The CPMpy model to export. + fname (str, optional): The file name to write the OPB output to. If None, the OPB string is returned. + encoding (str, optional): The encoding used for `int2bool`. Options: ("auto", "direct", "order", "binary"). + + Returns: + str or None: The OPB string if `fname` is None, otherwise nothing (writes to file). + + Format: + * #variable= #constraint= + * OPB file generated by CPMpy version + min/max: ; + ; + ; + ... + + Note: + Some solvers only support variable names of the form x. The OPB writer will remap + all CPMpy variables to such a format internally. 
+ + Example: + >>> from cpmpy import * + >>> x = boolvar(shape=3) + >>> m = Model(x[0] + x[1] + x[2] >= 2) + >>> print(write_opb(m)) + """ + + csemap, ivarmap = dict(), dict() + opb_cons = _transform(model.constraints, csemap, ivarmap, encoding) + + if model.objective_ is not None: + opb_obj, const, extra_cons = _transform_objective(model.objective_, csemap, ivarmap, encoding) + opb_cons += extra_cons + else: + opb_obj = None + + # Form header and variable mapping + # Use all variables occurring in constraints and the objective + all_vars = get_variables(opb_cons + ([opb_obj] if opb_obj is not None else [])) + out = [ + f"* #variable= {len(all_vars)} #constraint= {len(opb_cons)}", + f"* OPB file generated by CPMpy version {__version__}", + ] + # Remap variables to 'x1', 'x2', ..., the standard OPB way + varmap = {v: f"x{i+1}" for i, v in enumerate(all_vars)} + + # Write objective, if present + if model.objective_ is not None: + objective_str = _wsum_to_str(opb_obj, varmap) + out.append(f"{'min' if model.objective_is_min else 'max'}: {objective_str};") + + # Write constraints + for cons in opb_cons: + assert isinstance(cons, Comparison), f"Expected a comparison, but got {cons}" + lhs, rhs = cons.args + constraint_str = f"{_wsum_to_str(lhs, varmap)} {cons.name} {rhs};" + out.append(constraint_str) + + # Output to file or string + contents = "\n".join(out) + if fname is None: + return contents + else: + with open(fname, "w") as f: + f.write(contents) + +def _normalized_comparison(lst_of_expr): + """ + Convert a list of linear CPMpy expressions into OPB-compatible pseudo-Boolean constraints. + + Transforms a list of Boolean-linear CPMpy expressions (as output by `linearize_constraint`) into a list + of OPB-normalized constraints, expressed as comparisons between weighted Boolean sums + (using "wsum") and integer constants. Handles Boolean vars, reifications, implications, + and ensures all equalities are decomposed into two inequalities. 
+ + Args: + lst_of_expr (list): List of CPMpy Boolean-linear expressions. + + Returns: + list: List of normalized CPMpy `Comparison` objects representing pseudo-Boolean constraints. + """ + newlist = [] + for cpm_expr in lst_of_expr: + if isinstance(cpm_expr, cp.BoolVal) and cpm_expr.value() is False: + raise NotImplementedError(f"Cannot transform {cpm_expr} to OPB constraint") + + # single Boolean variable + if isinstance(cpm_expr, _BoolVarImpl): + cpm_expr = Operator("sum", [cpm_expr]) >= 1 + + # implication + if isinstance(cpm_expr, Operator) and cpm_expr.name == "->": + bv, subexpr = cpm_expr.args + assert isinstance(subexpr, _BoolVarImpl), "Only bv -> bv should reach here, but got {subexpr}" + cpm_expr = Operator("wsum", [[-1, 1], [bv, subexpr]]) >= 0 + newlist.append(cpm_expr) + continue + + # Comparison, can be single Boolean variable or (weighted) sum of Boolean variables + if isinstance(cpm_expr, Comparison): + lhs, rhs = cpm_expr.args + + if isinstance(lhs, _BoolVarImpl): + lhs = Operator("sum", [lhs]) + if lhs.name == "sum": + lhs = Operator("wsum", [[1]*len(lhs.args), lhs.args]) + + assert isinstance(lhs, Operator) and lhs.name == "wsum", f"Expected a wsum, but got {lhs}" + + # convert comparisons into >= constraints + if cpm_expr.name == "==": + newlist += _normalized_comparison([lhs <= rhs]) + newlist += _normalized_comparison([lhs >= rhs]) + elif cpm_expr.name == ">=": + newlist.append(lhs >= rhs) + elif cpm_expr.name == "<=": + new_weights = [-w for w in lhs.args[0]] + newlist.append(Operator("wsum", [new_weights, lhs.args[1]]) >= -rhs) + else: + raise ValueError(f"Unknown comparison {cpm_expr.name}") + else: + raise NotImplementedError(f"Expected a comparison, but got {cpm_expr}") + + return newlist + +def _wsum_to_str(cpm_expr, varmap): + """ + Convert a weighted sum CPMpy expression to a string in OPB format. 
+ + args: + cpm_expr (Operator): wsum CPMpy expression + varmap (dict): dictionary mapping CPMpy variables to OPB variable names + """ + assert isinstance(cpm_expr, Operator) and cpm_expr.name == "wsum", f"Expected a wsum, but got {cpm_expr}" + weights, args = cpm_expr.args + + out = [] + for w, var in zip(weights, args): + var = varmap[var] if not isinstance(var, NegBoolView) else f"~{varmap[var._bv]}" + if w < 0: + out.append(f"- {w} {var}") + elif w > 0: + out.append(f"+ {w} {var}") + else: + pass # zero weight, ignore + + str_out = " ".join(out) + return str_out + +def _transform(cpm_expr, csemap, ivarmap, encoding="auto"): + """ + Transform a list of CPMpy expressions into a list of Pseudo-Boolean constraints. + """ + + cpm_cons = toplevel_list(cpm_expr) + cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"div", "mod", "element"}) + cpm_cons = decompose_in_tree(cpm_cons, + supported={"alldifferent"}, # alldiff has a specialized MIP decomp in linearize + csemap=csemap + ) + cpm_cons = simplify_boolean(cpm_cons) + cpm_cons = flatten_constraint(cpm_cons, csemap=csemap) # flat normal form + cpm_cons = only_bv_reifies(cpm_cons, csemap=csemap) + cpm_cons = only_implies(cpm_cons, csemap=csemap) + cpm_cons = linearize_constraint( + cpm_cons, supported=frozenset({"sum", "wsum"}), csemap=csemap + ) + cpm_cons = int2bool(cpm_cons, ivarmap, encoding=encoding) + + return _normalized_comparison(cpm_cons) + +def _transform_objective(expr, csemap, ivarmap, encoding="auto"): + """ + Transform a CPMpy objective expression into a weighted sum expression + """ + + # transform objective + obj, safe_cons = safen_objective(expr) + obj, decomp_cons = decompose_objective(obj, supported={"alldifferent"}, + csemap=csemap) + obj, flat_cons = flatten_objective(obj, csemap=csemap) + obj = only_positive_bv_wsum(obj) # remove negboolviews + + weights, xs, const = [], [], 0 + # we assume obj is a var, a sum or a wsum (over int and bool vars) + if isinstance(obj, _IntVarImpl) or 
isinstance(obj, NegBoolView): # includes _BoolVarImpl + weights = [1] + xs = [obj] + elif obj.name == "sum": + xs = obj.args + weights = [1] * len(xs) + elif obj.name == "wsum": + weights, xs = obj.args + else: + raise NotImplementedError(f"OPB: Non supported objective {obj} (yet?)") + + terms, cons, k = _encode_lin_expr(ivarmap, xs, weights, encoding) + + # remove terms with coefficient 0 (`only_positive_coefficients_` may return them and RC2 does not accept them) + terms = [(w, x) for w,x in terms if w != 0] + + obj = Operator("wsum", [[w for w,x in terms], [x for w,x in terms]]) + return obj, const, safe_cons + decomp_cons + flat_cons + From 52e905d454a167716d7ba77dd7d63c554edad67b Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 14:07:39 +0100 Subject: [PATCH 089/152] add writer to module --- cpmpy/tools/io/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index b1ede66e4..4b96849a8 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -18,6 +18,6 @@ from .rcpsp import read_rcpsp # Model datasets -from .opb import read_opb +from .opb import read_opb, write_opb from .scip import read_scip, write_scip from .wcnf import read_wcnf \ No newline at end of file From 52a3948f118970da4469bab42fabb58427f840f5 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 30 Jan 2026 14:09:30 +0100 Subject: [PATCH 090/152] small change to docstring --- cpmpy/tools/io/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index 4b96849a8..19ae87d65 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -5,7 +5,7 @@ Use the generic `read(..., format="...")` and `write(..., format="...")` functions to read and write models in one of the supported formats. -Some formats can be auto-detected from the file extension. 
+Some formats can be auto-detected from the file extension, so only a file path is required as argument. """ from .writer import write, write_formats From 800570021bac9f7a5859a456b707dec442741f3e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 4 Feb 2026 12:01:13 +0100 Subject: [PATCH 091/152] update io --- cpmpy/tools/benchmark/opb.py | 2 +- cpmpy/tools/io/opb.py | 2 +- cpmpy/tools/io/scip.py | 62 ++++++++++++++++++------------------ 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/cpmpy/tools/benchmark/opb.py b/cpmpy/tools/benchmark/opb.py index 0c571a058..25abb3370 100644 --- a/cpmpy/tools/benchmark/opb.py +++ b/cpmpy/tools/benchmark/opb.py @@ -53,7 +53,7 @@ # CPMpy from cpmpy.tools.benchmark.runner import benchmark_runner from cpmpy.tools.benchmark._base import Benchmark -from cpmpy.tools.opb import read_opb +from cpmpy.tools.io.opb import read_opb from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus diff --git a/cpmpy/tools/io/opb.py b/cpmpy/tools/io/opb.py index 949ad4f44..ef32be718 100644 --- a/cpmpy/tools/io/opb.py +++ b/cpmpy/tools/io/opb.py @@ -37,7 +37,7 @@ from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective from cpmpy.transformations.reification import only_implies, only_bv_reifies from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv_wsum -from cpmpy.transformations.int2bool import int2bool, _encode_lin_expr +from cpmpy.transformations.int2bool import int2bool from cpmpy.transformations.get_variables import get_variables from cpmpy.expressions.variables import _IntVarImpl, NegBoolView, _BoolVarImpl from cpmpy.expressions.core import Operator, Comparison diff --git a/cpmpy/tools/io/scip.py b/cpmpy/tools/io/scip.py index 3ea825c52..1f32dc6e5 100644 --- a/cpmpy/tools/io/scip.py +++ b/cpmpy/tools/io/scip.py @@ -52,7 +52,7 @@ from cpmpy.transformations.reification import only_implies, reify_rewrite from cpmpy.expressions.utils import is_any_list, is_num from 
cpmpy.expressions.globalconstraints import DirectConstraint -from cpmpy.expressions.variables import ignore_variable_name_check +# from cpmpy.expressions.variables import ignore_variable_name_check _std_open = open @@ -71,39 +71,39 @@ def read_scip(fname: Union[str, os.PathLike], open=open, assume_integer:bool=Fal if not _SCIPWriter.supported(): raise Exception("SCIP: Install SCIP IO dependencies: cpmpy[io.scip]") - with ignore_variable_name_check(): + # with ignore_variable_name_check(): - from pyscipopt import Model - - # Load file into pyscipopt model - scip = Model() - scip.hideOutput() - scip.readProblem(filename=fname) - scip.hideOutput(quiet=False) - - # 1) translate variables - scip_vars = scip.getVars() - var_map = {} - for var in scip_vars: - name = var.name # name of the variable - vtype = var.vtype() # type of the variable - if vtype == "BINARY": - var_map[name] = cp.boolvar(name=name) - elif vtype == "INTEGER": - lb = int(var.getLbOriginal()) - ub = int(var.getUbOriginal()) + from pyscipopt import Model + + # Load file into pyscipopt model + scip = Model() + scip.hideOutput() + scip.readProblem(filename=fname) + scip.hideOutput(quiet=False) + + # 1) translate variables + scip_vars = scip.getVars() + var_map = {} + for var in scip_vars: + name = var.name # name of the variable + vtype = var.vtype() # type of the variable + if vtype == "BINARY": + var_map[name] = cp.boolvar(name=name) + elif vtype == "INTEGER": + lb = int(var.getLbOriginal()) + ub = int(var.getUbOriginal()) + var_map[name] = cp.intvar(lb, ub, name=name) + elif vtype == "CONTINUOUS": + if assume_integer: + lb = int(math.ceil(var.getLbOriginal())) + ub = int(math.floor(var.getUbOriginal())) + if lb != var.getLbOriginal() or ub != var.getUbOriginal(): + warnings.warn(f"Continuous variable {name} has non-integer bounds {var.getLbOriginal()} - {var.getUbOriginal()}. 
CPMpy will assume it is integer.") var_map[name] = cp.intvar(lb, ub, name=name) - elif vtype == "CONTINUOUS": - if assume_integer: - lb = int(math.ceil(var.getLbOriginal())) - ub = int(math.floor(var.getUbOriginal())) - if lb != var.getLbOriginal() or ub != var.getUbOriginal(): - warnings.warn(f"Continuous variable {name} has non-integer bounds {var.getLbOriginal()} - {var.getUbOriginal()}. CPMpy will assume it is integer.") - var_map[name] = cp.intvar(lb, ub, name=name) - else: - raise ValueError(f"CPMpy does not support continious variables: {name}") else: - raise ValueError(f"Unsupported variable type: {vtype}") + raise ValueError(f"CPMpy does not support continious variables: {name}") + else: + raise ValueError(f"Unsupported variable type: {vtype}") model = cp.Model() From 8e7fb1c099b3873c61d12a7ed06efa9364bad9d2 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 4 Feb 2026 12:01:25 +0100 Subject: [PATCH 092/152] update base --- cpmpy/tools/dataset/_base.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 3f4482644..3bf91076d 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -153,8 +153,10 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, req = Request(url + target) with urlopen(req) as response: total_size = int(response.headers.get('Content-Length', 0)) - - _Dataset._download_sequential(url + target, destination if destination is not None else temp_destination.name, total_size, desc, chunk_size) + + # Convert destination to Path for _download_sequential + download_path = pathlib.Path(destination) if destination is not None else pathlib.Path(temp_destination.name) + _Dataset._download_sequential(url + target, download_path, total_size, desc, chunk_size) if destination is None: temp_destination.close() @@ -169,6 +171,14 @@ def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc 
chunk_size: int = 1024 * 1024): """Download file sequentially (fallback method).""" import sys + import os + + # Convert to Path if it's a string + if isinstance(filepath, str): + filepath = pathlib.Path(filepath) + + # Ensure parent directory exists + filepath.parent.mkdir(parents=True, exist_ok=True) req = Request(url) with urlopen(req) as response: From 67643b3bd49a8b2b009bb11e2e3ddeb404688296 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 4 Feb 2026 12:19:55 +0100 Subject: [PATCH 093/152] add observers --- cpmpy/tools/benchmark/test/observer.py | 664 +++++++++++++++++++++++++ 1 file changed, 664 insertions(+) create mode 100644 cpmpy/tools/benchmark/test/observer.py diff --git a/cpmpy/tools/benchmark/test/observer.py b/cpmpy/tools/benchmark/test/observer.py new file mode 100644 index 000000000..2bf7e6b79 --- /dev/null +++ b/cpmpy/tools/benchmark/test/observer.py @@ -0,0 +1,664 @@ +from abc import ABC + +import logging +import signal +import sys +import warnings +import os +import time +from typing import Optional +import contextlib +import cpmpy as cp +from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus +from cpmpy.tools.benchmark.opb import solution_opb +from cpmpy.tools.benchmark import set_memory_limit, set_time_limit, _bytes_as_mb, _bytes_as_gb, _mib_as_bytes + +from .runner import Runner + + +class Observer(ABC): + + def __init__(self, **kwargs): + self.kwargs = kwargs + + def observe_init(self, runner: Runner): + pass + + def observe_pre_transform(self, runner: Runner): + pass + + def observe_post_transform(self, runner: Runner): + pass + + def observe_pre_solve(self, runner: Runner): + pass + + def observe_post_solve(self, runner: Runner): + pass + + def participate_solver_args(self, runner: Runner, solver_args: dict): + return solver_args + + def observe_exception(self, runner: Runner, exc_type, exc_value, traceback): + """ + Called when an exception occurs in the context. 
+ + Returns: + True if the exception should be suppressed, False/None to propagate. + """ + pass + + def observe_exit(self, runner: Runner): + pass + + def observe_end(self, runner: Runner): + pass + + def print_comment(self, comment: str): + pass + + def observe_intermediate(self, runner: Runner, objective: int): + pass + + def get_context_manager(self, runner: Runner): + """ + Return a context manager that will be entered when the ObserverContext is entered. + Return None if this observer doesn't provide a context manager. + """ + return None + + +# ---------------------------------------------------------------------------- # +# Collection of pre-made observers: # +# ---------------------------------------------------------------------------- # + + +class HandlerObserver(Observer): + + def __init__(self, **kwargs): + self.runner = None + + def observe_init(self, runner: Runner): + self.runner = runner + signal.signal(signal.SIGINT, self._sigterm_handler) + signal.signal(signal.SIGTERM, self._sigterm_handler) + signal.signal(signal.SIGINT, self._sigterm_handler) + signal.signal(signal.SIGABRT, self._sigterm_handler) + if sys.platform != "win32": + signal.signal(signal.SIGXCPU, self._rlimit_cpu_handler) + else: + warnings.warn("Windows does not support setting SIGXCPU signal") + + def _sigterm_handler(self, _signo, _stack_frame): + exit_code = self.handle_sigterm() + print(flush=True) + os._exit(exit_code) + + def _rlimit_cpu_handler(self, _signo, _stack_frame): + # Raise TimeoutError - ObserverContext will handle notifying observers + # Don't notify here to avoid duplicates + raise TimeoutError("CPU time limit reached (SIGXCPU)") + + def handle_sigterm(self): + return 0 + + def handle_rlimit_cpu(self): + return 0 + + +class LoggerObserver(Observer): + def __init__(self, **kwargs): + # Use a unique logger name for this observer instance + self.logger = logging.getLogger(f"{__name__}.LoggerObserver") + # Set level to INFO to ensure messages are logged + 
class LoggerObserver(Observer):
    """
    Observer that reports runner lifecycle events through the `logging` module,
    always writing to the original stdout even when stdout is redirected.

    NOTE(review): the start of this class is reconstructed from a partial view;
    the original acquires `self.logger` before configuring it — confirm the
    logger name/signature against the full file.
    """

    def __init__(self, name: str = __name__, **kwargs):
        self.logger = logging.getLogger(name)  # reconstructed head — see class note
        self.logger.setLevel(logging.INFO)
        # Disable propagation to root logger to avoid duplicate messages
        self.logger.propagate = False
        # Store reference to original stdout to always print there, even if redirected
        self.original_stdout = sys.__stdout__
        # Always add a new handler to ensure it writes to original stdout;
        # remove existing handlers first to avoid duplicates.
        self.logger.handlers.clear()
        handler = logging.StreamHandler(self.original_stdout)
        handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        # Force the logger to be effective at INFO level
        self.logger.disabled = False

    def observe_init(self, runner: Runner):
        self.logger.info("Initializing runner")

    def observe_pre_transform(self, runner: Runner):
        self.logger.info("Pre-transforming")

    def observe_post_transform(self, runner: Runner):
        self.logger.info("Post-transforming")

    def observe_pre_solve(self, runner: Runner):
        self.logger.info("Pre-solving")

    def observe_post_solve(self, runner: Runner):
        self.logger.info("Post-solving")

    def print_comment(self, comment: str):
        # Log at INFO level and flush immediately so comments appear in order.
        self.logger.info(comment)
        for handler in self.logger.handlers:
            handler.flush()


class CompetitionPrintingObserver(Observer):
    """Prints solver results in competition output format ('s'/'v'/'o' lines)."""

    def __init__(self, verbose: bool = False, **kwargs):
        self.verbose = verbose

    def print_comment(self, comment: str):
        # Comment is already formatted by Runner.print_comment() before being
        # passed to observers, so just print it as-is (competition CRLF ending).
        print(comment.rstrip('\n'), end="\r\n", flush=True)

    def observe_post_solve(self, runner: Runner):
        self.print_result(runner.s)

    def observe_intermediate(self, objective: int):
        self.print_intermediate(objective)

    def print_status(self, status: str):
        # 's' line: final solver status
        print("s " + status, end="\n", flush=True)

    def print_value(self, value: str):
        # 'v' line: solution values
        print("v " + value, end="\n", flush=True)

    def print_objective(self, objective: int):
        # 'o' line: objective value
        print("o " + str(objective), end="\n", flush=True)

    def print_intermediate(self, objective: int):
        self.print_objective(objective)

    def print_result(self, s):
        """Print the final 'o'/'v'/'s' lines according to the solve status."""
        if s.status().exitstatus == CPMStatus.OPTIMAL:
            self.print_objective(s.objective_value())
            self.print_value(solution_opb(s))
            # NOTE(review): competition specs usually require "OPTIMUM FOUND" —
            # confirm whether "OPTIMAL FOUND" is intentional.
            self.print_status("OPTIMAL FOUND")
        elif s.status().exitstatus == CPMStatus.FEASIBLE:
            self.print_objective(s.objective_value())
            self.print_value(solution_opb(s))
            self.print_status("SATISFIABLE")
        elif s.status().exitstatus == CPMStatus.UNSATISFIABLE:
            self.print_status("UNSATISFIABLE")
        else:
            self.print_comment("Solver did not find any solution within the time/memory limit")
            self.print_status("UNKNOWN")


class ResourceLimitObserver(Observer):
    """Sets process-level time/memory limits and reports limit violations as comments."""

    def __init__(self, time_limit: Optional[int] = None, mem_limit: Optional[int] = None, **kwargs):
        self.time_limit = time_limit   # seconds, or None for unlimited
        self.mem_limit = mem_limit     # MiB, or None for unlimited

    def observe_init(self, runner: Runner):
        if self.time_limit is not None:
            set_time_limit(self.time_limit)
        if self.mem_limit is not None:
            set_memory_limit(self.mem_limit)

    def _handle_memory_error(self, runner: Runner, mem_limit: int):
        runner.print_comment(f"MemoryError raised. Reached limit of {mem_limit} MiB")

    def _handle_timeout(self, runner: Runner, time_limit: int):
        if time_limit is not None:
            runner.print_comment(f"TimeoutError raised. Reached limit of {time_limit} seconds")
        else:
            runner.print_comment(f"TimeoutError raised. CPU time limit reached")

    def observe_exception(self, runner: Runner, exc_type, exc_value, traceback):
        """
        Handle exceptions related to resource limits.
        Returns True to suppress the exception after handling.
        """
        if exc_type is MemoryError and self.mem_limit is not None:
            self._handle_memory_error(runner=runner, mem_limit=self.mem_limit)
            return True  # suppress the exception after handling
        if exc_type is TimeoutError and self.time_limit is not None:
            self._handle_timeout(runner=runner, time_limit=self.time_limit)
            return True  # suppress the exception after handling
        return False  # don't suppress other exceptions


class SolverArgsObserver(Observer):
    """
    Translates generic run settings (cores, seed, memory limit, intermediate
    printing) into solver-specific parameters for each supported backend.
    """

    def __init__(self, **kwargs):
        # fix: original assigned self.mem_limit twice; deduplicated
        self.time_limit = None
        self.mem_limit = None
        self.seed = None
        self.intermediate = False
        self.cores = 1
        self.kwargs = dict()

    def observe_init(self, runner: Runner):
        # Snapshot the runner's settings for later use in participate_solver_args.
        self.time_limit = runner.time_limit
        self.mem_limit = runner.mem_limit
        self.seed = runner.seed
        self.intermediate = runner.intermediate
        self.cores = runner.cores
        self.kwargs = runner.kwargs

    def _ortools_arguments(
        self,
        runner: Runner,
        model: cp.Model,
        cores: Optional[int] = None,
        seed: Optional[int] = None,
        intermediate: bool = False,
        **kwargs
    ):
        # https://github.com/google/or-tools/blob/stable/ortools/sat/sat_parameters.proto
        res = dict()

        # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688
        res |= {
            "interleave_search": True,
            "use_rins_lns": False,
        }
        if not model.has_objective():
            res |= { "num_violation_ls": 1 }

        if cores is not None:
            res |= { "num_search_workers": cores }
        if seed is not None:
            res |= { "random_seed": seed }

        if intermediate and model.has_objective():
            # Define custom ORT solution callback, then register it
            _self = self
            from ortools.sat.python import cp_model as ort

            class OrtSolutionCallback(ort.CpSolverSolutionCallback):
                """
                For intermediate objective printing.
                """

                def __init__(self):
                    super().__init__()
                    self.__start_time = time.time()
                    self.__solution_count = 1

                def on_solution_callback(self):
                    """Called on each new solution."""
                    current_time = time.time()
                    obj = int(self.ObjectiveValue())
                    _self.print_comment('Solution %i, time = %0.4fs' %
                                        (self.__solution_count, current_time - self.__start_time))
                    # NOTE(review): other backends call print_intermediate here;
                    # confirm observe_intermediate is the intended hook for ortools.
                    _self.observe_intermediate(runner=runner, objective=obj)
                    self.__solution_count += 1

                def solution_count(self):
                    """Returns the number of solutions found."""
                    return self.__solution_count

            # Register the callback
            res |= { "solution_callback": OrtSolutionCallback() }

        def internal_options(solver: "CPM_ortools"):
            # Options that must be set directly on the underlying ortools objects.
            # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688
            solver.ort_solver.parameters.subsolvers.extend(["default_lp", "max_lp", "quick_restart"])
            if not model.has_objective():
                solver.ort_solver.parameters.subsolvers.append("core_or_no_lp")
            if len(solver.ort_model.proto.search_strategy) != 0:
                solver.ort_solver.parameters.subsolvers.append("fixed")

        return res, internal_options

    def _exact_arguments(
        self,
        seed: Optional[int] = None,
        **kwargs
    ):
        # Documentation: https://gitlab.com/JoD/exact/-/blob/main/src/Options.hpp?ref_type=heads
        res = dict()
        if seed is not None:
            res |= { "seed": seed }

        return res, None

    def _choco_arguments(self):
        # Documentation: https://github.com/chocoteam/pychoco/blob/master/pychoco/solver.py
        return {}, None

    def _z3_arguments(
        self,
        model: cp.Model,
        cores: int = 1,
        seed: Optional[int] = None,
        mem_limit: Optional[int] = None,
        **kwargs
    ):
        # Documentation: https://microsoft.github.io/z3guide/programming/Parameters/
        # -> is outdated, just let it crash and z3 will report the available options
        res = dict()

        if model.has_objective():
            # Opt does not seem to support setting random seed or max memory
            pass
        else:
            # Sat parameters
            if cores is not None:
                res |= { "threads": cores }  # TODO what with hyperthreading, when more threads than cores
            if seed is not None:
                res |= { "random_seed": seed }
            if mem_limit is not None:
                res |= { "max_memory": _bytes_as_mb(mem_limit) }

        return res, None

    def _minizinc_arguments(
        self,
        solver: str,
        cores: Optional[int] = None,
        seed: Optional[int] = None,
        **kwargs
    ):
        # Documentation: https://minizinc-python.readthedocs.io/en/latest/api.html#minizinc.instance.Instance.solve
        res = dict()
        if cores is not None:
            res |= { "processes": cores }
        if seed is not None:
            res |= { "random_seed": seed }

        #if solver.endswith("gecode"):
        # Documentation: https://www.minizinc.org/doc-2.4.3/en/lib-gecode.html
        #elif solver.endswith("chuffed"):
        # Documentation:
        # - https://www.minizinc.org/doc-2.5.5/en/lib-chuffed.html
        # - https://github.com/chuffed/chuffed/blob/develop/chuffed/core/options.h

        return res, None

    def _gurobi_arguments(
        self,
        model: cp.Model,
        cores: Optional[int] = None,
        seed: Optional[int] = None,
        mem_limit: Optional[int] = None,
        intermediate: bool = False,
        **kwargs
    ):
        # Documentation: https://www.gurobi.com/documentation/9.5/refman/parameters.html#sec:Parameters
        res = dict()
        if cores is not None:
            res |= { "Threads": cores }
        if seed is not None:
            res |= { "Seed": seed }
        if mem_limit is not None:
            res |= { "MemLimit": _bytes_as_gb(mem_limit) }

        if intermediate and model.has_objective():

            _self = self

            class GurobiSolutionCallback:
                def __init__(self, model: cp.Model):
                    self.__start_time = time.time()
                    self.__solution_count = 0
                    self.model = model

                def callback(self, *args, **kwargs):
                    current_time = time.time()
                    model, state = args

                    # Callback codes: https://www.gurobi.com/documentation/current/refman/cb_codes.html#sec:CallbackCodes

                    from gurobipy import GRB
                    # if state == GRB.Callback.MESSAGE: # verbose logging
                    #     print_comment("log message: " + str(model.cbGet(GRB.Callback.MSG_STRING)))
                    if state == GRB.Callback.MIP:  # callback from the MIP solver
                        if model.cbGet(GRB.Callback.MIP_SOLCNT) > self.__solution_count:  # do we have a new solution?
                            obj = int(model.cbGet(GRB.Callback.MIP_OBJBST))
                            _self.print_comment('Solution %i, time = %0.4fs' %
                                                (self.__solution_count, current_time - self.__start_time))
                            _self.print_intermediate(obj)
                            self.__solution_count = model.cbGet(GRB.Callback.MIP_SOLCNT)

            res |= { "solution_callback": GurobiSolutionCallback(model).callback }

        return res, None

    def _cpo_arguments(
        self,
        model: cp.Model,
        cores: Optional[int] = None,
        seed: Optional[int] = None,
        intermediate: bool = False,
        **kwargs
    ):
        # Documentation: https://ibmdecisionoptimization.github.io/docplex-doc/cp/docplex.cp.parameters.py.html#docplex.cp.parameters.CpoParameters
        res = dict()
        if cores is not None:
            res |= { "Workers": cores }
        if seed is not None:
            res |= { "RandomSeed": seed }

        if intermediate and model.has_objective():
            from docplex.cp.solver.solver_listener import CpoSolverListener
            _self = self

            class CpoSolutionCallback(CpoSolverListener):

                def __init__(self):
                    super().__init__()
                    self.__start_time = time.time()
                    self.__solution_count = 1

                def result_found(self, solver, sres):
                    current_time = time.time()
                    obj = sres.get_objective_value()
                    if obj is not None:
                        _self.print_comment('Solution %i, time = %0.4fs' %
                                            (self.__solution_count, current_time - self.__start_time))
                        _self.print_intermediate(obj)
                        self.__solution_count += 1

                def solution_count(self):
                    """Returns the number of solutions found."""
                    return self.__solution_count

            # Register the callback (the class itself, as docplex instantiates it)
            res |= { "solution_callback": CpoSolutionCallback }

        return res, None

    def _cplex_arguments(
        self,
        cores: Optional[int] = None,
        seed: Optional[int] = None,
        **kwargs
    ):
        res = dict()
        if cores is not None:
            res |= {"threads": cores}
        if seed is not None:
            res |= {"randomseed": seed}

        return res, None

    def _hexaly_arguments(
        self,
        model: cp.Model,
        cores: Optional[int] = None,
        seed: Optional[int] = None,
        intermediate: bool = False,
        **kwargs
    ):
        res = dict()
        #res |= {"nb_threads": cores}
        #res |= {"seed": seed}

        if intermediate and model.has_objective():
            # Define custom Hexaly solution callback, then register it
            _self = self

            class HexSolutionCallback:

                def __init__(self):
                    self.__start_time = time.time()
                    self.__solution_count = 0

                def on_solution_callback(self, optimizer, cb_type):
                    """Called on each new solution."""
                    # check if solution with different objective (or if verbose)
                    current_time = time.time()
                    obj = optimizer.model.objectives[0]
                    _self.print_comment('Solution %i, time = %0.4fs' %
                                        (self.__solution_count, current_time - self.__start_time))
                    _self.print_intermediate(obj)
                    self.__solution_count += 1

                def solution_count(self):
                    return self.__solution_count

            # Register the callback
            res |= { "solution_callback": HexSolutionCallback().on_solution_callback }

        return res, None

    def _solver_arguments(
        self,
        runner: Runner,
        solver: str,
        model: cp.Model,
        seed: Optional[int] = None,
        intermediate: bool = False,
        cores: int = 1,
        mem_limit: Optional[int] = None,
        **kwargs
    ):
        """Dispatch to the per-solver argument builder.

        Returns a tuple (solver_kwargs, internal_options_callable_or_None).
        """
        opt = model.has_objective()  # removed unused `sat` local

        if solver == "ortools":
            return self._ortools_arguments(runner, model, cores=cores, seed=seed, intermediate=intermediate, **kwargs)
        elif solver == "exact":
            return self._exact_arguments(seed=seed, **kwargs)
        elif solver == "choco":
            return self._choco_arguments()
        elif solver == "z3":
            return self._z3_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, **kwargs)
        elif solver.startswith("minizinc"):  # also can have a subsolver
            return self._minizinc_arguments(solver, cores=cores, seed=seed, **kwargs)
        elif solver == "gurobi":
            return self._gurobi_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, intermediate=intermediate, opt=opt, **kwargs)
        elif solver == "cpo":
            return self._cpo_arguments(model=model, cores=cores, seed=seed, intermediate=intermediate, **kwargs)
        elif solver == "hexaly":
            return self._hexaly_arguments(model, cores=cores, seed=seed, intermediate=intermediate, **kwargs)
        elif solver == "cplex":
            # fix: forward the seed (it was dropped for cplex only, unlike all other backends)
            return self._cplex_arguments(cores=cores, seed=seed, **kwargs)
        else:
            runner.print_comment(f"setting parameters of {solver} is not (yet) supported")
            return dict(), None

    def participate_solver_args(self, runner: Runner, solver_args: dict):
        """Merge solver-specific arguments into solver_args and apply internal options."""
        args, internal_options = self._solver_arguments(
            runner, runner.solver, model=runner.model, seed=self.seed,
            intermediate=self.intermediate, cores=self.cores,
            mem_limit=_mib_as_bytes(self.mem_limit) if self.mem_limit is not None else None,
            **self.kwargs)

        if internal_options is not None:
            internal_options(runner.s)
        solver_args |= args
        runner.print_comment(f"Solver arguments: {args}")


class RuntimeObserver(Observer):
    """Reports wall-clock timings for the transform/solve phases and the total run."""

    def __init__(self, **kwargs):
        self.start_time = None
        self.end_time = None
        self.start_transform_time = None
        self.end_transform_time = None

    def observe_init(self, runner: Runner):
        self.start_time = time.time()

    def observe_pre_transform(self, runner: Runner):
        self.start_transform_time = time.time()

    def observe_post_transform(self, runner: Runner):
        self.end_transform_time = time.time()
        runner.print_comment(f"Time taken to transform: {self.end_transform_time - self.start_transform_time} seconds")

    def observe_post_solve(self, runner: Runner):
        runner.print_comment(f"Time taken to solve: {runner.s.status().runtime} seconds")

    def observe_end(self, runner: Runner):
        runner.print_comment(f"Total time taken: {time.time() - self.start_time} seconds")


class SolutionCheckerObserver(Observer):
    """Placeholder: will invoke a solution checker at the end of a run."""

    def observe_end(self, runner: Runner):
        runner.print_comment(f"Run solution checker here...")


class WriteToFileObserver(Observer):
    """Redirects run output (stdout and comments) to a file."""

    def __init__(self, output_file: str, overwrite: bool = True, **kwargs):
        self.file_path = output_file
        self.file_handle = None
        self.context_active = False
        self.overwrite = overwrite
        self.file_opened = False  # Track if file has been opened in write mode

    def get_context_manager(self, runner: Runner):
        """Return a context manager that redirects stdout to a file."""
        @contextlib.contextmanager
        def redirect_to_file():
            # If overwrite and file hasn't been opened yet, open in write mode.
            # Otherwise, append to preserve existing content.
            # NOTE(review): context_active is not reset in a finally block, so an
            # exception inside the context leaves it True — confirm intended.
            mode = 'w' if (self.overwrite and not self.file_opened) else 'a'
            with open(self.file_path, mode) as f:
                self.file_handle = f
                self.context_active = True
                self.file_opened = True
                with contextlib.redirect_stdout(f):
                    yield
                self.context_active = False
                self.file_handle = None
        return redirect_to_file()

    def print_comment(self, comment: str, runner: 'Runner' = None):
        """Write comments to the file using the print_comment hook (in addition to stdout)."""
        # Comment is already formatted by Runner.print_comment() before being passed to observers
        formatted_comment = comment.rstrip('\n\r')

        if self.context_active and self.file_handle is not None:
            # Context is active, write directly to the file handle
            self.file_handle.write(formatted_comment + '\r\n')
            self.file_handle.flush()
        else:
            # Context not active yet or has exited.
            # If overwrite and file hasn't been opened, open in write mode to truncate;
            # otherwise, append to preserve existing content.
            if self.overwrite and not self.file_opened:
                mode = 'w'
                self.file_opened = True
            else:
                mode = 'a'
            with open(self.file_path, mode) as f:
                f.write(formatted_comment + '\r\n')

    def observe_init(self, runner: Runner):
        """Store reference to runner so we can access instance_runner."""
        self._runner = runner
def create_output_file(output_file: Optional[str], base_dir: Optional[str] = None, *args) -> str:
    """
    Create an output file path (and make sure its parent directories exist).

    Arguments:
        output_file: The output file path (relative or absolute), or None.
        base_dir: Base directory for relative output files (default: "results").
        *args: Components used to build a default filename when output_file is None.

    Returns:
        The full output file path.
    """
    if base_dir is None:
        base_dir = "results"

    if output_file is None:
        output_file = f"{'_'.join(args)}.txt"

    # Absolute paths are used as-is; relative ones are placed under base_dir.
    if os.path.isabs(output_file):
        full_path = output_file
    else:
        full_path = os.path.join(base_dir, output_file)

    Path(full_path).parent.mkdir(parents=True, exist_ok=True)

    return full_path


class InstanceRunner:
    """
    Base class for running a single benchmark instance.
    (Further methods — print_comment, register_observer, run, … — appear in
    later hunks of this patch and are not reproduced here.)
    """

    def __init__(self):
        self.additional_observers = []
        # Get the file path of the concrete class, not the base class,
        # so generated commands point at the subclass' own file.
        self.this_file_path = os.path.abspath(inspect.getfile(type(self)))
        self.this_python = sys.executable

    @staticmethod
    def _register_once(runner, obs_instance, registered_file_paths: set) -> None:
        """Register obs_instance on runner, skipping duplicate file-writing
        observers that target the same output file."""
        if hasattr(obs_instance, 'file_path'):
            if obs_instance.file_path in registered_file_paths:
                return  # skip duplicate WriteToFileObserver for the same file
            registered_file_paths.add(obs_instance.file_path)
        runner.register_observer(obs_instance)

    def get_runner(self, instance: str, solver: str = "ortools", output_file: str = None, overwrite: bool = True, **kwargs):
        """Build a Runner with the default and additionally registered observers."""
        runner = Runner(reader=self.reader)
        # Store reference to instance_runner so observers can access it for formatting
        runner.instance_runner = self

        # Register default observers; pass output-file settings to those that accept them.
        # (fix: dropped the redundant `import inspect as inspect_module` — the
        # module-level `inspect` import is used throughout instead)
        for observer in self.default_observers:
            sig = inspect.signature(observer.__init__)
            if 'output_file' in sig.parameters or 'overwrite' in sig.parameters:
                runner.register_observer(observer(output_file=output_file, overwrite=overwrite))
            else:
                runner.register_observer(observer())

        # Register any additional observers that were added programmatically.
        # Track file paths to avoid duplicate WriteToFileObserver registrations.
        registered_file_paths = set()
        for observer in self.get_additional_observers():
            if isinstance(observer, partial):
                # Partial function: call it to obtain the instance.
                self._register_once(runner, observer(), registered_file_paths)
            elif not inspect.isclass(observer):
                # Already an instance: use it directly.
                self._register_once(runner, observer, registered_file_paths)
            else:
                # A class: instantiate it, forwarding output-file settings if accepted.
                sig = inspect.signature(observer.__init__)
                if 'output_file' in sig.parameters or 'overwrite' in sig.parameters:
                    self._register_once(runner, observer(output_file=output_file, overwrite=overwrite), registered_file_paths)
                else:
                    runner.register_observer(observer())

        # Create output file path (side effect: creates the parent directories)
        output_file = create_output_file(output_file, None, solver, instance)

        return runner

    def cmd(self, instance: str):
        # To be provided by subclasses: full command line for this instance.
        pass

    def base_cmd(self, instance: str):
        """Return the base command to re-invoke this runner on `instance`."""
        return [
            self.this_python,
            self.this_file_path,
            instance,
        ]
Subclasses can override to add formatting (e.g., 'c ' prefix).""" + # Default implementation: just print (subclasses can override to add formatting) + print(comment) def register_observer(self, observer): """Register an observer to be added when run() is called.""" @@ -40,3 +139,10 @@ def get_additional_observers(self): """Get the list of additional observers that should be registered.""" return self.additional_observers + def run(self, instance: str, solver: str = "ortools", output_file: str = None, **kwargs): + + + + + self.runner = self.get_runner(instance, solver, output_file, **kwargs) + self.runner.run(instance=instance, solver=solver, output_file=output_file, **kwargs) diff --git a/cpmpy/tools/benchmark/test/manager.py b/cpmpy/tools/benchmark/test/manager.py index 1ded83305..19d507444 100644 --- a/cpmpy/tools/benchmark/test/manager.py +++ b/cpmpy/tools/benchmark/test/manager.py @@ -9,16 +9,27 @@ import logging from pathlib import Path +from cpmpy.tools.benchmark import _mib_as_bytes from cpmpy.tools.benchmark.test.instance_runner import InstanceRunner -from cpmpy.tools.benchmark.test.xcsp3_instance_runner import XCSP3InstanceRunner -from cpmpy.tools.benchmark.test.runner import ResourceLimitObserver +from cpmpy.tools.benchmark.test.run_xcsp3_instance import XCSP3InstanceRunner +from cpmpy.tools.benchmark.test.observer import ResourceLimitObserver class ResourceManager: + """ + Abstract base class for resource managers. + + Manages the allocation of resources (time, memory, cores) to a single instance run. + Sets limits on the resources and handles callbacks when these limits are exceeded. + """ pass class RunExecResourceManager: - + """ + Resource manager that uses benchexec's RunExecutor for resource control (build on cgroups and kernel namespaces). + Requires `benchexec` to be installed. 
+ """ + @contextlib.contextmanager def _print_forwarding_context(self, runner: InstanceRunner): """Context manager that forwards all print statements, warnings, and logging to runner.print_comment.""" @@ -210,10 +221,43 @@ def warning_handler(message, category, filename, lineno, file=None, line=None): stdout_forwarder.forward_to_runner() stderr_forwarder.forward_to_runner() - def run(self, instance: str, runner: InstanceRunner, time_limit: float, memory_limit: int, cores: list[int]): + def run(self, + instance: str, + runner: InstanceRunner, + time_limit: float, + memory_limit: int, + cores: list[int], + solver: str, + seed: int, + intermediate: bool, + verbose: bool, + output_file: str, + ) -> dict: + """ + Run a single instance with assigned resources. + + Arguments: + instance: Instance file path + runner: Instance runner + time_limit: Time limit in seconds + memory_limit: Memory limit in MB + cores: List of core IDs to assign to this run (e.g., [0, 1] for cores 0 and 1) - runner.print_comment(f"Running instance {instance} with time limit {time_limit} and memory limit {memory_limit} and cores {cores}") - runner.print_comment(f"Running with manager {self.__class__.__name__}") + runexec creates a new process and namespace for the instance run. So the benchmark needs to be run in a + separate process for runexec to be able to control the resources. 
+ """ + + # Automatically add WriteToFileObserver if output_file is provided + if output_file is not None: + from functools import partial + from cpmpy.tools.benchmark.test.observer import WriteToFileObserver + runner.register_observer(partial(WriteToFileObserver, output_file=output_file, overwrite=True)) + + _runner = runner.get_runner(instance, solver, output_file, overwrite=True) + # Use runner's print_comment to go through the callback system (observers) + # The CompetitionPrintingObserver (in default_observers) will add the 'c ' prefix + _runner.print_comment(f"Running instance {instance} with time limit {time_limit} and memory limit {memory_limit} and cores {cores}") + _runner.print_comment(f"Running with manager {self.__class__.__name__}") from benchexec.runexecutor import RunExecutor @@ -256,7 +300,7 @@ def signal_handler_kill(signum, frame): # softtimelimit=options.softtimelimit, walltimelimit=time_limit, cores=cores, - memlimit=memory_limit, + memlimit=_mib_as_bytes(memory_limit), # memory_nodes=options.memoryNodes, # cgroupValues=cgroup_values, # workingDir=options.dir, @@ -285,9 +329,8 @@ def _is_runexec_message(line): line_stripped = line.strip() # Skip empty lines and RunExecutor messages if line_stripped and not _is_runexec_message(line_stripped): - # Subprocess output is already formatted by the runner's observers, - # so print it directly without wrapping in print_comment to avoid double-prefixing - print(line_stripped, flush=True) + # Forward subprocess output through runner so observers can capture it + _runner.print_comment(line_stripped) except FileNotFoundError: # Output file might not exist if process was killed before writing pass @@ -298,18 +341,51 @@ def _is_runexec_message(line): except Exception: pass - runner.print_comment(f"RunExec result: {result}") + _runner.print_comment(f"RunExec result: {result}") if "terminationreason" in result: reason = result["terminationreason"] if reason == "memory": - runner.print_comment("Memory limit 
exceeded") + _runner.print_comment("Memory limit exceeded") elif reason == "walltime": - runner.print_comment("Wall time limit exceeded") + _runner.print_comment("Wall time limit exceeded") class PythonResourceManager: - - def run(self, instance: str, runner: InstanceRunner, time_limit: int, memory_limit: int, cores: list[int]): + """ + Resource manager that uses Python's resource module for resource control. + """ + + def run(self, + instance: str, + runner: InstanceRunner, + time_limit: int, + memory_limit: int, + cores: list[int], + solver: str, + seed: int, + intermediate: bool, + verbose: bool, + output_file: str, + ) -> dict: + """ + Run a single instance with assigned resources. + + Arguments: + instance: Instance file path + runner: Instance runner + time_limit: Time limit in seconds + memory_limit: Memory limit in MB + cores: List of core IDs to assign to this run (e.g., [0, 1] for cores 0 and 1) + + The python native approach to setting resource limits does not require spawning a separate process for the instance run. + As a downside, it offers less control over the resources and is less robust. 
+ """ + # Automatically add WriteToFileObserver if output_file is provided + if output_file is not None: + from functools import partial + from cpmpy.tools.benchmark.test.observer import WriteToFileObserver + runner.register_observer(partial(WriteToFileObserver, output_file=output_file, overwrite=True)) + # Programmatically add ResourceLimitObserver if limits are provided if time_limit is not None or memory_limit is not None: # Add a resource observer with limits @@ -320,26 +396,25 @@ def run(self, instance: str, runner: InstanceRunner, time_limit: int, memory_lim runner.register_observer(resource_observer) # Run the instance using the runner's run method - runner.run(instance=instance, time_limit=time_limit, mem_limit=memory_limit, cores=len(cores) if cores else None) + runner.run(instance=instance, solver=solver, seed=seed, intermediate=intermediate, verbose=verbose, output_file=output_file, time_limit=time_limit, mem_limit=memory_limit, cores=len(cores) if cores else None) -def run_instance(instance: str, instance_runner: InstanceRunner, time_limit: int, memory_limit: int, cores: list[int], resource_manager: ResourceManager): - - +def run_instance(instance: str, instance_runner: InstanceRunner, time_limit: int, memory_limit: int, cores: list[int], resource_manager: ResourceManager, solver: str, seed: int, intermediate: bool, verbose: bool, output_file: str): """ Run a single instance with assigned cores. 
- Args: + Arguments: instance: Instance file path + instance_runner: Instance runner time_limit: Time limit in seconds memory_limit: Memory limit in MB cores: List of core IDs to assign to this run (e.g., [0, 1] for cores 0 and 1) """ - resource_manager.run(instance, instance_runner, time_limit, memory_limit, cores) + resource_manager.run(instance, instance_runner, time_limit, memory_limit, cores, solver, seed, intermediate, verbose, output_file) # Convert cores list to comma-separated string for runexec @@ -361,7 +436,7 @@ def load_instance_runner(runner_path: str) -> InstanceRunner: """ Load an instance runner class from a module path. - Args: + Arguments: runner_path: Path to the instance runner class, e.g., "cpmpy.tools.benchmark.test.xcsp3_instance_runner.XCSP3InstanceRunner" or a file path like "/path/to/module.py:ClassName" diff --git a/cpmpy/tools/benchmark/test/run_benchmark.py b/cpmpy/tools/benchmark/test/run_benchmark.py new file mode 100644 index 000000000..5ed12a5d3 --- /dev/null +++ b/cpmpy/tools/benchmark/test/run_benchmark.py @@ -0,0 +1,1025 @@ +#!/usr/bin/env python3 +""" +Generic CLI for running benchmarks with any InstanceRunner. + +This script provides a flexible command-line interface for running benchmarks +with configurable runners, observers, and run settings. 
+ +Usage Examples: + # Run a single instance + python run_benchmark.py instance.xml --runner xcsp3 --solver ortools + + # Run a single instance with output file + python run_benchmark.py instance.xml --runner xcsp3 --solver ortools --output /path/to/output.txt + + # Run with custom observers + python run_benchmark.py instance.xml --runner xcsp3 --observers CompetitionPrintingObserver RuntimeObserver + + # Run with observer constructor arguments + python run_benchmark.py instance.xml --runner xcsp3 --observers "WriteToFileObserver(output_file=\"/path/to/file.txt\", overwrite=False)" + + # Run a batch of instances in parallel with output directory + python run_benchmark.py --batch instances.txt --runner xcsp3 --workers 4 --output ./results + + # Run a dataset with output directory + python run_benchmark.py --dataset cpmpy.tools.dataset.model.xcsp3.XCSP3Dataset --dataset-year 2024 --dataset-track COP --dataset-download --runner xcsp3 --output ./results + + # Run a dataset with custom root directory + python run_benchmark.py --dataset cpmpy.tools.dataset.model.xcsp3.XCSP3Dataset --dataset-year 2024 --dataset-track CSP --dataset-root ./data --runner xcsp3 --workers 4 --output ./results + + # Load a custom runner + python run_benchmark.py instance.xml --runner cpmpy.tools.benchmark.test.xcsp3_instance_runner.XCSP3InstanceRunner +""" + +import argparse +import importlib +import sys +import ast +from typing import List, Optional, Dict, Any +from concurrent.futures import ProcessPoolExecutor +from multiprocessing import Manager + +from cpmpy.tools.benchmark.test.instance_runner import InstanceRunner +from cpmpy.tools.benchmark.test.manager import load_instance_runner, run_instance, RunExecResourceManager +from cpmpy.tools.benchmark.test.observer import ( + Observer, + CompetitionPrintingObserver, + HandlerObserver, + LoggerObserver, + ResourceLimitObserver, + SolverArgsObserver, + RuntimeObserver, + SolutionCheckerObserver, + WriteToFileObserver, +) + + +# Map of observer 
names to classes +# Note: WriteToFileObserver is not included here as it requires a file_path argument +# Use format "WriteToFileObserver:/path/to/file.txt" if needed, or omit it +# (output files are automatically created in results/ directory via output_file parameter) +OBSERVER_CLASSES = { + "CompetitionPrintingObserver": CompetitionPrintingObserver, + "HandlerObserver": HandlerObserver, + "LoggerObserver": LoggerObserver, + "ResourceLimitObserver": ResourceLimitObserver, + "SolverArgsObserver": SolverArgsObserver, + "RuntimeObserver": RuntimeObserver, + "SolutionCheckerObserver": SolutionCheckerObserver, +} + +# Aliases for shorter names +OBSERVER_ALIASES = { + "WriteToFile": "WriteToFileObserver", + "Competition": "CompetitionPrintingObserver", + "Handler": "HandlerObserver", + "Logger": "LoggerObserver", + "ResourceLimit": "ResourceLimitObserver", + "SolverArgs": "SolverArgsObserver", + "Runtime": "RuntimeObserver", + "SolutionChecker": "SolutionCheckerObserver", +} + + +def parse_observer_with_args(observer_spec: str) -> tuple[str, Dict[str, Any]]: + """ + Parse an observer specification that may include constructor arguments. + + Supports formats: + - "ObserverClass" -> ("ObserverClass", {}) + - "module.path.ObserverClass" -> ("module.path.ObserverClass", {}) + - "ObserverClass(arg1=val1,arg2=val2)" -> ("ObserverClass", {"arg1": val1, "arg2": val2}) + - "module.path.ObserverClass(arg1=val1,arg2=val2)" -> ("module.path.ObserverClass", {"arg1": val1, "arg2": val2}) + + Arguments: + observer_spec: Observer specification string + + Returns: + Tuple of (observer_path, kwargs_dict) + """ + # Check if there are constructor arguments + # Match pattern: classname(...) where ... 
can contain nested parentheses + # We need to find the last opening parenthesis that matches a closing one + paren_pos = observer_spec.rfind('(') + if paren_pos != -1 and observer_spec.endswith(')'): + observer_path = observer_spec[:paren_pos] + args_str = observer_spec[paren_pos + 1:-1] # Remove the parentheses + + # Parse the arguments string into a dict + kwargs = {} + if args_str.strip(): + # Use ast.literal_eval to safely parse the arguments + # Wrap in braces to make it a dict literal + try: + parsed = ast.literal_eval(f"{{{args_str}}}") + if isinstance(parsed, dict): + kwargs = parsed + else: + raise ValueError(f"Invalid argument format: {args_str}. Expected key=value pairs") + except (ValueError, SyntaxError): + # If that fails, try manual parsing for key=value pairs + # This handles cases where values might have commas or special characters + for pair in args_str.split(','): + pair = pair.strip() + if '=' in pair: + # Find the first = sign (key=value) + eq_pos = pair.find('=') + key = pair[:eq_pos].strip() + value = pair[eq_pos + 1:].strip() + # Try to parse the value + try: + # Try as literal (bool, int, float, None, string) + parsed_value = ast.literal_eval(value) + except (ValueError, SyntaxError): + # If that fails, treat as string (remove quotes if present) + if (value.startswith('"') and value.endswith('"')) or \ + (value.startswith("'") and value.endswith("'")): + parsed_value = value[1:-1] + else: + parsed_value = value + kwargs[key] = parsed_value + else: + raise ValueError(f"Invalid argument format: {pair}. Expected 'key=value'") + + return observer_path, kwargs + else: + return observer_spec, {} + + +def load_observer(observer_name: str) -> Observer: + """ + Load an observer by name or module path, optionally with constructor arguments. 
+ + Arguments: + observer_name: Either a simple name (e.g., "CompetitionPrintingObserver") + or a full module path (e.g., "cpmpy.tools.benchmark.test.observer.CompetitionPrintingObserver") + or a file path (e.g., "/path/to/file.py:ClassName" or "path/to/file.py::ClassName") + or with arguments (e.g., "WriteToFileObserver(file_path='/path/to/file.txt')") + For WriteToFileObserver, use format "WriteToFileObserver:file_path" or provide file_path separately + + Returns: + Observer instance + """ + import importlib.util + from pathlib import Path + + # Parse observer name and arguments + observer_path, kwargs = parse_observer_with_args(observer_name) + + # Resolve aliases at the top level (e.g., "WriteToFile" -> "WriteToFileObserver") + if observer_path in OBSERVER_ALIASES: + observer_path = OBSERVER_ALIASES[observer_path] + + # Check for file path format: /path/to/file.py:ClassName or path/to/file.py::ClassName + # Also handle module.path.to.file.py::ClassName (convert to module path) + if "::" in observer_path or ("::" not in observer_path and ".py:" in observer_path): + # Split on :: or : (but not :/ for absolute paths on Windows) + if "::" in observer_path: + file_part, class_name = observer_path.rsplit("::", 1) + else: + file_part, class_name = observer_path.rsplit(":", 1) + + # Resolve alias for class name (e.g., "WriteToFile" -> "WriteToFileObserver") + if class_name in OBSERVER_ALIASES: + class_name = OBSERVER_ALIASES[class_name] + + # Convert to module path format if it looks like module.path.file.py + if ".py" in file_part and not file_part.startswith("/") and not file_part.startswith("."): + # Format: cpmpy.tools.benchmark.test.observer.py -> cpmpy.tools.benchmark.test.observer + module_path = file_part.replace(".py", "") + try: + module = importlib.import_module(module_path) + observer_class = getattr(module, class_name) + if not issubclass(observer_class, Observer): + raise ValueError(f"{observer_class} is not a subclass of Observer") + + # Handle 
WriteToFileObserver special case + if class_name == "WriteToFileObserver": + if "file_path" in kwargs and "output_file" not in kwargs: + kwargs["output_file"] = kwargs.pop("file_path") + if "output_file" not in kwargs: + # Default output file + kwargs["output_file"] = "results/output.txt" + + return observer_class(**kwargs) + except (ImportError, AttributeError) as e: + raise ValueError(f"Could not load observer '{observer_path}': {e}") + + # Handle actual file paths + file_path = Path(file_part).resolve() + if file_path.exists(): + # Add parent directory to sys.path if needed + parent_dir = str(file_path.parent) + if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + + # Import the module + module_name = file_path.stem + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + # Get the class + observer_class = getattr(module, class_name) + if not issubclass(observer_class, Observer): + raise ValueError(f"{observer_class} is not a subclass of Observer") + + # Handle WriteToFileObserver special case + if class_name == "WriteToFileObserver": + if "file_path" in kwargs and "output_file" not in kwargs: + kwargs["output_file"] = kwargs.pop("file_path") + if "output_file" not in kwargs: + # Default output file + kwargs["output_file"] = "results/output.txt" + + return observer_class(**kwargs) + + # Special handling for WriteToFileObserver + if observer_path.startswith("WriteToFileObserver") or observer_path.endswith("WriteToFileObserver"): + if ":" in observer_path and "::" not in observer_path: + # Format: WriteToFileObserver:/path/to/file.txt (legacy format) + _, file_path = observer_path.split(":", 1) + kwargs["output_file"] = file_path + return WriteToFileObserver(**kwargs) + # Support both output_file and file_path for backward compatibility + if "file_path" in kwargs and "output_file" not in kwargs: + kwargs["output_file"] = kwargs.pop("file_path") + if 
"output_file" not in kwargs: + # Default output file + kwargs["output_file"] = "results/output.txt" + return WriteToFileObserver(**kwargs) + + # Check if it's a known observer name + if observer_path in OBSERVER_CLASSES: + observer_class = OBSERVER_CLASSES[observer_path] + return observer_class(**kwargs) + + # Try to load from module path + if "." in observer_path: + module_path, class_name = observer_path.rsplit(".", 1) + try: + module = importlib.import_module(module_path) + observer_class = getattr(module, class_name) + if not issubclass(observer_class, Observer): + raise ValueError(f"{observer_class} is not a subclass of Observer") + # Check if it's WriteToFileObserver loaded via module path + if class_name == "WriteToFileObserver": + # Support both output_file and file_path for backward compatibility + if "file_path" in kwargs and "output_file" not in kwargs: + kwargs["output_file"] = kwargs.pop("file_path") + if "output_file" not in kwargs: + # Default output file + kwargs["output_file"] = "results/output.txt" + return observer_class(**kwargs) + except (ImportError, AttributeError) as e: + raise ValueError(f"Could not load observer '{observer_path}': {e}") + + raise ValueError(f"Unknown observer: {observer_path}. Available: {', '.join(OBSERVER_CLASSES.keys())}") + + +def load_observers(observer_names: Optional[List[str]]) -> List[Observer]: + """ + Load multiple observers from a list of names. 
+ + Arguments: + observer_names: List of observer names or module paths + + Returns: + List of Observer instances + """ + if not observer_names: + return [] + + observers = [] + for name in observer_names: + observers.append(load_observer(name)) + + return observers + + +def run_single_instance( + instance: str, + runner: InstanceRunner, + solver: str = "ortools", + time_limit: Optional[float] = None, + mem_limit: Optional[int] = None, + seed: Optional[int] = None, + cores: Optional[int] = None, + intermediate: bool = False, + verbose: bool = False, + output_file: Optional[str] = None, + additional_observers: Optional[List[Observer]] = None, +): + """ + Run a single instance with the given runner and settings. + """ + # Automatically add WriteToFileObserver if output_file is provided + if output_file is not None: + # Ensure the output file path is absolute or properly constructed + from cpmpy.tools.benchmark.test.instance_runner import create_output_file + from functools import partial + output_file = create_output_file(output_file, None, solver, instance) + runner.register_observer(partial(WriteToFileObserver, output_file=output_file, overwrite=True)) + + # Register additional observers + if additional_observers: + for observer in additional_observers: + runner.register_observer(observer) + + # Run the instance + runner.run( + instance=instance, + solver=solver, + time_limit=time_limit, + mem_limit=mem_limit, + seed=seed, + cores=cores, + intermediate=intermediate, + verbose=verbose, + output_file=output_file, + ) + + +def worker_function(worker_id, cores, job_queue, time_limit, memory_limit, runner_path, solver, seed, intermediate, verbose, output_dir): + """Worker function for parallel execution.""" + resource_manager = RunExecResourceManager() + + while True: + try: + instance, metadata = job_queue.get_nowait() + except Exception: + break + + # Create a fresh instance_runner for each instance to avoid observer accumulation + instance_runner = 
load_instance_runner(runner_path) + + # Construct output_file path for this instance + output_file = None + if output_dir is not None: + from cpmpy.tools.benchmark.test.instance_runner import create_output_file + # Extract instance name for filename + import os + instance_name = os.path.splitext(os.path.basename(instance))[0] + output_file = create_output_file(None, output_dir, solver, instance_name) + # Note: WriteToFileObserver will be automatically added by the resource manager + + run_instance( + instance, + instance_runner, + time_limit, + memory_limit, + cores, + resource_manager, + solver, + seed, + intermediate, + verbose, + output_file, + ) + job_queue.task_done() + + +def compute_workers_and_memory( + workers: Optional[int], + total_memory: Optional[int], + memory_per_worker: Optional[int], + ignore_check: bool = False, +) -> tuple[int, Optional[int]]: + """ + Compute workers and memory_per_worker from the given parameters. + + Derives whichever value is missing: + - If total_memory and memory_per_worker are set, derive workers + - If total_memory and workers are set, derive memory_per_worker + - If memory_per_worker and workers are set, derive total_memory (but return memory_per_worker) + + If total_memory is not provided, it will be automatically measured from the system. 
+ + If all are set, checks feasibility: total_memory == workers * memory_per_worker + + Arguments: + workers: Number of workers (None to derive) + total_memory: Total memory in MiB (None to derive or measure) + memory_per_worker: Memory per worker in MiB (None to derive) + ignore_check: If True, ignore feasibility check and just warn + + Returns: + Tuple of (workers, memory_per_worker) + """ + import psutil + + # If total_memory is not provided, measure it from the system + if total_memory is None: + # Get total virtual memory in bytes and convert to MiB + total_memory = psutil.virtual_memory().total // (1024 * 1024) + + # Count how many values are set (now total_memory is always set) + set_count = sum(1 for x in [workers, memory_per_worker] if x is not None) + + if set_count == 0: + # Defaults: 1 worker, no memory limit per worker + return 1, None + + if set_count == 1: + # Only one value set - derive the other from total_memory + if workers is not None: + # Derive memory_per_worker from total_memory and workers + if total_memory % workers != 0: + raise ValueError( + f"Measured total-memory ({total_memory} MiB) is not evenly divisible by " + f"workers ({workers})" + ) + memory_per_worker = total_memory // workers + if memory_per_worker < 1: + raise ValueError( + f"Derived memory-per-worker ({memory_per_worker} MiB) must be at least 1. " + f"Check your workers value relative to available memory ({total_memory} MiB)." + ) + return workers, memory_per_worker + else: # memory_per_worker is not None + # Derive workers from total_memory and memory_per_worker + if total_memory % memory_per_worker != 0: + raise ValueError( + f"Measured total-memory ({total_memory} MiB) is not evenly divisible by " + f"memory-per-worker ({memory_per_worker} MiB)" + ) + workers = total_memory // memory_per_worker + if workers < 1: + raise ValueError( + f"Derived workers ({workers}) must be at least 1. 
" + f"Check your memory-per-worker value relative to available memory ({total_memory} MiB)." + ) + return workers, memory_per_worker + + if set_count == 2: + # Both workers and memory_per_worker are provided - use them as-is + # Derive total_memory for validation only (don't override user input) + expected_total = workers * memory_per_worker + if total_memory < expected_total: + # Warn if measured total is less than what's needed + message = ( + f"Memory configuration: workers ({workers}) × memory-per-worker ({memory_per_worker} MiB) = " + f"{expected_total} MiB, but measured total-memory is {total_memory} MiB. " + f"Using specified memory-per-worker ({memory_per_worker} MiB) anyway." + ) + print(f"WARNING: {message}", file=sys.stderr) + # Use the user-provided values as-is - manual input always takes precedence + return workers, memory_per_worker + + else: # set_count == 3, all values are set + # Check feasibility + expected_total = workers * memory_per_worker + if total_memory != expected_total: + message = ( + f"Memory configuration is not feasible: " + f"workers ({workers}) × memory-per-worker ({memory_per_worker} MiB) = " + f"{expected_total} MiB, but total-memory is {total_memory} MiB" + ) + if ignore_check: + print(f"WARNING: {message}. Continuing anyway...", file=sys.stderr) + else: + raise ValueError(message + ". Use --ignore-memory-check to override.") + + return workers, memory_per_worker + + +def run_batch( + instances: List[str], + runner_path: str, + solver: str = "ortools", + time_limit: Optional[float] = None, + mem_limit: Optional[int] = None, + seed: Optional[int] = None, + workers: Optional[int] = None, + cores_per_worker: int = 1, + total_memory: Optional[int] = None, + memory_per_worker: Optional[int] = None, + ignore_memory_check: bool = False, + intermediate: bool = False, + verbose: bool = False, + output_dir: Optional[str] = None, +): + """ + Run a batch of instances in parallel. 
+ """ + import psutil + + # Store original user inputs to preserve manual overrides + original_memory_per_worker = memory_per_worker + original_workers = workers + + # Compute workers and memory_per_worker from the given parameters + computed_workers, computed_memory_per_worker = compute_workers_and_memory( + workers, total_memory, memory_per_worker, ignore_memory_check + ) + + # Use computed workers (unless both were provided, then use original) + if original_workers is not None and original_memory_per_worker is not None: + # Both were provided - use original values (compute_workers_and_memory already returns them) + workers = computed_workers + memory_per_worker = computed_memory_per_worker + else: + # Use computed values + workers = computed_workers + if original_memory_per_worker is not None: + # User explicitly provided memory_per_worker - use it + memory_per_worker = original_memory_per_worker + else: + memory_per_worker = computed_memory_per_worker + + # Use memory_per_worker as mem_limit if not explicitly set + # But if user explicitly set mem_limit, that takes precedence + if mem_limit is None and memory_per_worker is not None: + mem_limit = memory_per_worker + + total_cores = psutil.cpu_count(logical=False) + + if workers * cores_per_worker > total_cores: + raise ValueError( + f"Not enough cores: {workers} workers × {cores_per_worker} cores = " + f"{workers * cores_per_worker} cores needed, but only {total_cores} available" + ) + + # Assign cores to each worker + worker_cores = [] + for i in range(workers): + start_core = i * cores_per_worker + end_core = start_core + cores_per_worker + cores = list(range(start_core, end_core)) + worker_cores.append(cores) + + if verbose: + print(f"Total cores: {total_cores}, Workers: {workers}, Cores per worker: {cores_per_worker}") + for i, cores in enumerate(worker_cores): + print(f"Worker {i}: cores {cores}") + + # Create a queue of all jobs + with Manager() as manager: + job_queue = manager.Queue() + for instance in 
instances: + job_queue.put((instance, {})) + + # Submit workers to the executor + with ProcessPoolExecutor(max_workers=workers) as executor: + futures = [ + executor.submit( + worker_function, + worker_id, + cores, + job_queue, + time_limit, + mem_limit, + runner_path, + solver, + seed, + intermediate, + verbose, + output_dir, + ) + for worker_id, cores in enumerate(worker_cores) + ] + # Wait for all workers to finish + for future in futures: + future.result() + + +def parse_instance_list(file_path: str) -> List[str]: + """Parse a file containing instance paths (one per line).""" + with open(file_path, 'r') as f: + instances = [line.strip() for line in f if line.strip() and not line.strip().startswith('#')] + return instances + + +def load_dataset(dataset_path: str, dataset_kwargs: dict): + """ + Load a dataset class and instantiate it with the given kwargs. + + Arguments: + dataset_path: Path to the dataset class, e.g., + "cpmpy.tools.dataset.model.xcsp3.XCSP3Dataset" + or a file path like "/path/to/dataset.py:ClassName" + dataset_kwargs: Dictionary of keyword arguments to pass to the dataset constructor + + Returns: + Dataset instance + """ + import importlib.util + from pathlib import Path + + if ":" in dataset_path: + # Format: /path/to/dataset.py:ClassName + file_path, class_name = dataset_path.rsplit(":", 1) + file_path = Path(file_path).resolve() + + # Add parent directory to sys.path if needed + parent_dir = str(file_path.parent) + if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + + # Import the module + module_name = file_path.stem + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + # Get the class + dataset_class = getattr(module, class_name) + elif "." 
in dataset_path: + # Format: module.path.ClassName + module_path, class_name = dataset_path.rsplit(".", 1) + module = importlib.import_module(module_path) + dataset_class = getattr(module, class_name) + else: + raise ValueError(f"Invalid dataset path format: {dataset_path}. Use 'module.path.ClassName' or '/path/to/file.py:ClassName'") + + # Instantiate the dataset with the provided kwargs + return dataset_class(**dataset_kwargs) + + +def main(): + parser = argparse.ArgumentParser( + description="Generic CLI for running benchmarks with any InstanceRunner", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__ + ) + + # Instance input - use optional positional and check manually to avoid argparse issues + parser.add_argument( + "instance", + nargs="?", + type=str, + help="Path to a single instance file to run" + ) + parser.add_argument( + "--batch", + type=str, + metavar="FILE", + help="Path to a file containing instance paths (one per line) for batch processing" + ) + parser.add_argument( + "--dataset", + type=str, + metavar="DATASET_CLASS", + help="Dataset class to use. Can be a full module path " + "(e.g., 'cpmpy.tools.dataset.model.xcsp3.XCSP3Dataset') " + "or a file path (e.g., '/path/to/dataset.py:ClassName')" + ) + + # Runner configuration + parser.add_argument( + "--runner", + type=str, + default="xcsp3", + help="InstanceRunner to use. Can be a simple name (e.g., 'xcsp3') or a full module path " + "(e.g., 'cpmpy.tools.benchmark.test.xcsp3_instance_runner.XCSP3InstanceRunner') " + "or a file path (e.g., '/path/to/runner.py:ClassName')" + ) + + # Observer configuration + parser.add_argument( + "--observers", + type=str, + nargs="+", + default=None, + metavar="OBSERVER", + help="Additional observers to register. Can specify multiple. " + "Available: " + ", ".join(OBSERVER_CLASSES.keys()) + ". " + "Or use full module path like 'cpmpy.tools.benchmark.test.observer.CompetitionPrintingObserver'. 
" + "To pass constructor arguments, use format 'ObserverClass(arg1=val1,arg2=val2)'. " + "Example: 'WriteToFileObserver(file_path=\"/path/to/file.txt\", overwrite=False)'. " + "Note: WriteToFileObserver is automatically added to write outputs to results/ directory. " + "To use a custom file path, use format 'WriteToFileObserver(file_path=\"/path/to/file.txt\")'" + ) + + # Solver settings + parser.add_argument( + "--solver", + type=str, + default="ortools", + help="Solver to use (default: ortools)" + ) + + # Run settings + parser.add_argument( + "--time_limit", + type=float, + default=None, + help="Time limit in seconds" + ) + parser.add_argument( + "--mem_limit", + type=int, + default=None, + help="Memory limit in MiB" + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Random seed" + ) + parser.add_argument( + "--cores", + type=int, + default=None, + help="Number of CPU cores to use (for single instance)" + ) + parser.add_argument( + "--intermediate", + action="store_true", + help="Print intermediate solutions" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Verbose output" + ) + parser.add_argument( + "--output", + type=str, + default=None, + help="Output path: for single instance, this is the output file path; " + "for batch/dataset, this is the directory where output files will be placed" + ) + + # Batch processing settings + parser.add_argument( + "--workers", + type=int, + default=None, + help="Number of parallel workers for batch processing. If not set, will be derived from " + "total-memory and memory-per-worker if those are set." + ) + parser.add_argument( + "--cores_per_worker", + type=int, + default=1, + help="Number of cores per worker for batch processing (default: 1)" + ) + parser.add_argument( + "--total-memory", + type=int, + default=None, + metavar="MiB", + help="Total memory available in MiB. If set along with memory-per-worker, will derive " + "number of workers. 
If set along with workers, will derive memory-per-worker." + ) + parser.add_argument( + "--memory-per-worker", + type=int, + default=None, + metavar="MiB", + help="Memory per worker in MiB. If set along with total-memory, will derive " + "number of workers. If set along with workers, will derive total-memory." + ) + parser.add_argument( + "--ignore-memory-check", + action="store_true", + help="Ignore feasibility check when all memory/worker parameters are set. " + "Will print a warning if configuration is not feasible but still allow the run to start." + ) + + # Dataset configuration options + parser.add_argument( + "--dataset-root", + type=str, + default="./data", + help="Root directory for dataset (default: './data')" + ) + parser.add_argument( + "--dataset-year", + type=int, + default=None, + help="Year for dataset (e.g., 2024 for XCSP3Dataset)" + ) + parser.add_argument( + "--dataset-track", + type=str, + default=None, + help="Track for dataset (e.g., 'COP', 'CSP' for XCSP3Dataset)" + ) + parser.add_argument( + "--dataset-download", + action="store_true", + help="Download dataset if not available locally" + ) + parser.add_argument( + "--dataset-variant", + type=str, + default=None, + help="Variant for dataset (e.g., for PSPLibDataset)" + ) + parser.add_argument( + "--dataset-family", + type=str, + default=None, + help="Family for dataset (e.g., for PSPLibDataset)" + ) + parser.add_argument( + "--dataset-option", + type=str, + nargs=2, + action="append", + metavar=("KEY", "VALUE"), + help="Additional dataset options as key-value pairs. Can be specified multiple times. 
" + "Example: --dataset-option transform my_transform --dataset-option target_transform my_target" + ) + + args = parser.parse_args() + + # Check that exactly one of instance, --batch, or --dataset is provided + provided = sum([args.instance is not None, args.batch is not None, args.dataset is not None]) + if provided == 0: + parser.error("One of 'instance', '--batch', or '--dataset' must be provided") + elif provided > 1: + parser.error("Only one of 'instance', '--batch', or '--dataset' can be provided") + + # Load the runner + try: + if args.runner == "xcsp3": + # Special case for xcsp3 + from cpmpy.tools.benchmark.test.run_xcsp3_instance import XCSP3InstanceRunner + runner = XCSP3InstanceRunner() + else: + runner = load_instance_runner(args.runner) + except Exception as e: + print(f"Error loading runner '{args.runner}': {e}", file=sys.stderr) + sys.exit(1) + + # Load observers + additional_observers = None + if args.observers: + try: + additional_observers = load_observers(args.observers) + except Exception as e: + print(f"Error loading observers: {e}", file=sys.stderr) + sys.exit(1) + + # Run single instance, batch, or dataset + if args.dataset: + # Dataset processing + try: + # Build dataset kwargs from arguments + dataset_kwargs = {} + + # Common parameters + # Always set root (default is "./data") + dataset_kwargs["root"] = args.dataset_root + if args.dataset_download: + dataset_kwargs["download"] = True + + # Year/track parameters (for XCSP3Dataset, OPBDataset, etc.) + if args.dataset_year is not None: + dataset_kwargs["year"] = args.dataset_year + if args.dataset_track: + dataset_kwargs["track"] = args.dataset_track + + # Variant/family parameters (for PSPLibDataset, etc.) 
+ if args.dataset_variant: + dataset_kwargs["variant"] = args.dataset_variant + if args.dataset_family: + dataset_kwargs["family"] = args.dataset_family + + # Additional options from --dataset-option + if args.dataset_option: + for key, value in args.dataset_option: + # Try to convert value to appropriate type + try: + # Try int first + value = int(value) + except ValueError: + try: + # Try float + value = float(value) + except ValueError: + # Try bool + if value.lower() in ("true", "false"): + value = value.lower() == "true" + # Otherwise keep as string + dataset_kwargs[key] = value + + # Load and instantiate the dataset + dataset = load_dataset(args.dataset, dataset_kwargs) + + # Get instances from dataset + instances = [] + for instance, metadata in dataset: + instances.append(instance) + + if not instances: + print("No instances found in dataset", file=sys.stderr) + sys.exit(1) + + # Compute workers and memory configuration + workers, memory_per_worker = compute_workers_and_memory( + args.workers, args.total_memory, args.memory_per_worker, args.ignore_memory_check + ) + + if args.verbose: + # Get the actual total memory used (may have been measured) + import psutil + actual_total = args.total_memory if args.total_memory is not None else psutil.virtual_memory().total // (1024 * 1024) + if args.total_memory: + print(f"Total memory: {args.total_memory} MiB (user-specified)") + else: + print(f"Total memory: {actual_total} MiB (measured from system)") + if memory_per_worker: + print(f"Memory per worker: {memory_per_worker} MiB") + print(f"Running {len(instances)} instances from dataset with {workers} workers") + + run_batch( + instances=instances, + runner_path=args.runner, + solver=args.solver, + time_limit=args.time_limit, + mem_limit=args.mem_limit if args.mem_limit is not None else memory_per_worker, + seed=args.seed, + workers=workers, + cores_per_worker=args.cores_per_worker, + total_memory=args.total_memory, + memory_per_worker=memory_per_worker, + 
ignore_memory_check=args.ignore_memory_check, + intermediate=args.intermediate, + verbose=args.verbose, + output_dir=args.output, + ) + except Exception as e: + print(f"Error running dataset: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + elif args.batch: + # Batch processing + try: + instances = parse_instance_list(args.batch) + if not instances: + print(f"No instances found in {args.batch}", file=sys.stderr) + sys.exit(1) + + # Compute workers and memory configuration + workers, memory_per_worker = compute_workers_and_memory( + args.workers, args.total_memory, args.memory_per_worker, args.ignore_memory_check + ) + + if args.verbose: + # Get the actual total memory used (may have been measured) + import psutil + actual_total = args.total_memory if args.total_memory is not None else psutil.virtual_memory().total // (1024 * 1024) + if args.total_memory: + print(f"Total memory: {args.total_memory} MiB (user-specified)") + else: + print(f"Total memory: {actual_total} MiB (measured from system)") + if memory_per_worker: + print(f"Memory per worker: {memory_per_worker} MiB") + print(f"Running {len(instances)} instances with {workers} workers") + + run_batch( + instances=instances, + runner_path=args.runner, + solver=args.solver, + time_limit=args.time_limit, + mem_limit=args.mem_limit if args.mem_limit is not None else memory_per_worker, + seed=args.seed, + workers=workers, + cores_per_worker=args.cores_per_worker, + total_memory=args.total_memory, + memory_per_worker=memory_per_worker, + ignore_memory_check=args.ignore_memory_check, + intermediate=args.intermediate, + verbose=args.verbose, + output_dir=args.output, + ) + except Exception as e: + print(f"Error running batch: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + else: + # Single instance + if not args.instance: + parser.error("Either provide an instance path or use --batch") + + try: + run_single_instance( + instance=args.instance, + 
runner=runner, + solver=args.solver, + time_limit=args.time_limit, + mem_limit=args.mem_limit, + seed=args.seed, + cores=args.cores, + intermediate=args.intermediate, + verbose=args.verbose, + output_file=args.output, + additional_observers=additional_observers, + ) + except Exception as e: + print(f"Error running instance: {e}", file=sys.stderr) + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/cpmpy/tools/benchmark/test/run_xcsp3.py b/cpmpy/tools/benchmark/test/run_xcsp3.py deleted file mode 100644 index c66e57d55..000000000 --- a/cpmpy/tools/benchmark/test/run_xcsp3.py +++ /dev/null @@ -1,58 +0,0 @@ -""" -Deprecated: Use XCSP3InstanceRunner instead -""" -import argparse -import lzma -from pathlib import Path -from functools import partial - -from cpmpy.tools.benchmark.test.runner import Runner, CompetitionPrintingObserver, ProfilingObserver, HandlerObserver, SolverArgsObserver, SolutionCheckerObserver, WriteToFileObserver -from cpmpy.tools.dataset.model.xcsp3 import XCSP3Dataset -from cpmpy.tools.xcsp3 import read_xcsp3 - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("instance", type=str) - parser.add_argument("--verbose", action="store_true", default=False) - parser.add_argument("--solver", type=str, default="ortools") - parser.add_argument("--time_limit", type=int, default=None) - parser.add_argument("--mem_limit", type=int, default=None) - parser.add_argument("--seed", type=int, default=None) - parser.add_argument("--intermediate", action="store_true", default=False) - parser.add_argument("--cores", type=int, default=None) - parser.add_argument("--output_file", type=str, default=None) - # parser.add_argument("--kwargs", type=str, default="") - parser.add_argument("--observers", type=list[str], default=None) - - args = parser.parse_args() - - - if args.output_file is None: - args.output_file = f"results/{args.solver}_{args.instance}.txt" - else: - args.output_file = 
f"results/{args.output_file}" - - Path(args.output_file).parent.mkdir(parents=True, exist_ok=True) - - - # dataset = XCSP3Dataset(root="./data", year=2024, track="CSP24", download=True) - - runner = Runner(reader=partial(read_xcsp3, open= lambda instance: lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance))) - # runner.register_observer(LoggerObserver()) - runner.register_observer(CompetitionPrintingObserver()) - runner.register_observer(ProfilingObserver()) - # runner.register_observer(ResourceLimitObserver(time_limit=args.time_limit, mem_limit=args.mem_limit)) - runner.register_observer(HandlerObserver()) - runner.register_observer(SolverArgsObserver()) - runner.register_observer(SolutionCheckerObserver()) - #runner.register_observer(WriteToFileObserver(file_path=args.output_file)) - - for observer in args.observers: - pass - - - print(vars(args)) - runner.run(**vars(args)) - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/cpmpy/tools/benchmark/test/bench_xcsp3.py b/cpmpy/tools/benchmark/test/run_xcsp3_benchmark.py similarity index 94% rename from cpmpy/tools/benchmark/test/bench_xcsp3.py rename to cpmpy/tools/benchmark/test/run_xcsp3_benchmark.py index e8a1caa30..dbec67822 100644 --- a/cpmpy/tools/benchmark/test/bench_xcsp3.py +++ b/cpmpy/tools/benchmark/test/run_xcsp3_benchmark.py @@ -2,7 +2,7 @@ from multiprocessing import Manager from cpmpy.tools.benchmark.test.manager import RunExecResourceManager, run_instance -from cpmpy.tools.benchmark.test.xcsp3_instance_runner import XCSP3InstanceRunner +from cpmpy.tools.benchmark.test.run_xcsp3_instance import XCSP3InstanceRunner def worker_function(worker_id, cores, job_queue, time_limit, memory_limit): @@ -30,12 +30,12 @@ def main(): # dataset = XCSP3Dataset(root="./data", year=2025, track="CSP25", download=True) # dataset = OPBDataset(root="./data", year=2024, track="DEC-LIN", download=True) # dataset = JSPLibDataset(root="./data", 
download=True) - dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) + dataset = XCSP3Dataset(root="./data", year=2024, track="COP", download=True) time_limit = 10*60 workers = 1 cores_per_worker = 1 - total_memory = 25000 + total_memory = 16000 # MiB memory_per_worker = total_memory // workers memory_limit = memory_per_worker# Bytes to MB # resource_manager = RunExecResourceManager() diff --git a/cpmpy/tools/benchmark/test/run_xcsp3_instance.py b/cpmpy/tools/benchmark/test/run_xcsp3_instance.py new file mode 100644 index 000000000..06cef8cab --- /dev/null +++ b/cpmpy/tools/benchmark/test/run_xcsp3_instance.py @@ -0,0 +1,48 @@ +from functools import partial +import lzma + + +from cpmpy.tools.benchmark.test.instance_runner import InstanceRunner +from cpmpy.tools.benchmark.test.observer import CompetitionPrintingObserver, HandlerObserver, RuntimeObserver, ResourceLimitObserver, Runner, SolverArgsObserver, SolutionCheckerObserver +from cpmpy.tools.xcsp3.parser import read_xcsp3 + + + + +class XCSP3InstanceRunner(InstanceRunner): + + default_observers = [ + CompetitionPrintingObserver, + RuntimeObserver, + HandlerObserver, + SolverArgsObserver, + SolutionCheckerObserver, + ResourceLimitObserver, + ] + + reader = partial(read_xcsp3, open= lambda instance: lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance)) + + def cmd(self, instance: str, solver: str = "ortools", output_file: str = None, **kwargs): + cmd = self.base_cmd(instance) + if solver is not None: + cmd.append("--solver") + cmd.append(solver) + if output_file is not None: + cmd.append("--output_file") + cmd.append(output_file) + return cmd + + def print_comment(self, comment: str): + print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) + + +def main(): + runner = XCSP3InstanceRunner() + + parser = runner.argparser() + args = parser.parse_args() + + runner.run(**vars(args)) + +if __name__ == "__main__": + main() \ No 
newline at end of file diff --git a/cpmpy/tools/benchmark/test/runner.py b/cpmpy/tools/benchmark/test/runner.py index a3ff3ee6c..fd3edcf59 100644 --- a/cpmpy/tools/benchmark/test/runner.py +++ b/cpmpy/tools/benchmark/test/runner.py @@ -1,35 +1,95 @@ -from abc import ABC, abstractmethod +from __future__ import annotations import psutil -from cpmpy.model import Model -import logging -import signal -import argparse import sys import warnings -import os -import time -from pathlib import Path from typing import Optional -from functools import partial import contextlib import cpmpy as cp -from cpmpy.solvers import solver_interface -from cpmpy.tools.benchmark import set_time_limit, set_memory_limit - from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus -from cpmpy.tools.benchmark.opb import solution_opb -from cpmpy.tools.benchmark import _mib_as_bytes, _wall_time, set_memory_limit, set_time_limit, _bytes_as_mb, _bytes_as_gb, disable_memory_limit +from cpmpy.tools.benchmark import _wall_time + +class ObserverContext: + """ + Context manager with registerable observers. + Upon entering the context, all context managers from the observers are entered. 
+ """ + def __init__(self, observers: list, runner: Runner): + """ + Arguments: + observers: List of observers to register + runner: Runner instance + """ + self.observers = observers or [] + self.runner = runner + self.exit_stack = contextlib.ExitStack() + + def __enter__(self): + # Enter all context managers from observers + if self.observers: + for observer in self.observers: + cm = observer.get_context_manager(runner=self.runner) + if cm is not None: + self.exit_stack.enter_context(cm) + return self + + def __exit__(self, exc_type, exc_value, traceback): + # First, exit all context managers (in reverse order) + # This happens automatically when we exit the ExitStack + exit_result = None + if self.exit_stack: + exit_result = self.exit_stack.__exit__(exc_type, exc_value, traceback) + + if exc_type is not None and self.observers: + # An exception occurred, notify all observers + # Let observers handle it and decide if exception should be suppressed + suppress_exception = False + for observer in self.observers: + try: + # Pass exception to observer, let it handle it + result = observer.observe_exception(runner=self.runner, exc_type=exc_type, exc_value=exc_value, traceback=traceback) + # If observer returns True, it wants to suppress the exception + if result is True: + suppress_exception = True + except Exception: + # Don't let observer exceptions mask the original exception + pass + + # If any observer wants to suppress, suppress the exception + if suppress_exception: + return True + + # Always call observe_exit on all observers + if self.observers: + for observer in self.observers: + try: + observer.observe_exit(runner=self.runner) + except Exception: + # Don't let observer exceptions interfere with cleanup + pass + + # Return the exit result from ExitStack (False to propagate, True to suppress) + return exit_result if exit_result is not None else False class Runner: + """ + Generic runner with registerable observers. 
+ """ def __init__(self, reader: callable): + """ + Arguments: + reader: Reader function to read the instance + """ self.observers = [] self.solver_args = {} self.reader = reader def register_observer(self, observer): + """ + Register an observer. + """ self.observers.append(observer) def read_instance(self, instance: str): @@ -39,6 +99,23 @@ def post_model(self, model: cp.Model, solver:str): return cp.SolverLookup.get(solver, model) def run(self, instance: str, solver: Optional[str] = None, time_limit: Optional[int] = None, mem_limit: Optional[int] = None, seed: Optional[int] = None, intermediate: bool = False, cores: int = 1, **kwargs): + """ + Run the runner. + + Arguments: + instance: Instance file path + solver: Solver to use + time_limit: Time limit in seconds + mem_limit: Memory limit in bytes + seed: Random seed + intermediate: Whether to print intermediate solutions + cores: Number of cores to use + **kwargs: Additional arguments + + Returns: + True if the instance is satisfiable, False otherwise + """ + self.solver = solver self.time_limit = time_limit self.mem_limit = mem_limit @@ -49,25 +126,25 @@ def run(self, instance: str, solver: Optional[str] = None, time_limit: Optional[ self.time_buffer = 1 self.verbose = True - with self.observer_context(): + + with self.observer_context(): # Enter all context managers from the observers self.observe_init() with self.print_forwarding_context(): self.model = self.read_instance(instance) - self.observe_pre_transform() with self.print_forwarding_context(): self.s = self.post_model(self.model, solver) self.observe_post_transform() - self.solver_args = self.participate_solver_args() + self.solver_args = self.collect_solver_args() if self.time_limit: # Get the current process p = psutil.Process() - # give solver only the remaining time + # Give solver only the remaining time time_limit = self.time_limit - _wall_time(p) - self.time_buffer if self.verbose: self.print_comment(f"{time_limit}s left to solve") @@ -94,12 
+171,35 @@ def run(self, instance: str, solver: Optional[str] = None, time_limit: Optional[ self.observe_end() - #print(self.is_sat) return self.is_sat def print_comment(self, comment: str): + # Format the comment using instance_runner if available, before passing to observers + formatted_comment = comment + if hasattr(self, 'instance_runner') and self.instance_runner is not None: + # Capture the formatted output from instance_runner.print_comment + import io + import sys + old_stdout = sys.stdout + sys.stdout = io.StringIO() + try: + self.instance_runner.print_comment(comment) + formatted_comment = sys.stdout.getvalue().rstrip('\n\r') + finally: + sys.stdout = old_stdout + + # Pass formatted comment to all observers for observer in self.observers: - observer.print_comment(comment) + # Pass runner to print_comment if observer accepts it + if hasattr(observer.print_comment, '__code__'): + import inspect + sig = inspect.signature(observer.print_comment) + if 'runner' in sig.parameters: + observer.print_comment(formatted_comment, runner=self) + else: + observer.print_comment(formatted_comment) + else: + observer.print_comment(formatted_comment) @contextlib.contextmanager def print_forwarding_context(self): @@ -156,6 +256,11 @@ def warning_handler(message, category, filename, lineno, file=None, line=None): # Forward any remaining buffered output forwarder.forward_to_observers() + + # ---------------------------------------------------------------------------- # + # Observer callback hooks # + # ---------------------------------------------------------------------------- # + def observer_context(self): return ObserverContext(observers=self.observers, runner=self) @@ -171,7 +276,6 @@ def observe_post_transform(self): for observer in self.observers: observer.observe_post_transform(runner=self) - def observe_pre_solve(self): for observer in self.observers: observer.observe_pre_solve(runner=self) @@ -184,750 +288,8 @@ def observe_end(self): for observer in self.observers: 
observer.observe_end(runner=self) - def participate_solver_args(self): + def collect_solver_args(self): solver_args = {} for observer in self.observers: observer.participate_solver_args(runner=self, solver_args=solver_args) return solver_args - -class ObserverContext: - def __init__(self, observers: list, runner: Runner): - self.observers = observers or [] - self.runner = runner - self.exit_stack = contextlib.ExitStack() - - def __enter__(self): - # Enter all context managers from observers - if self.observers: - for observer in self.observers: - cm = observer.get_context_manager(runner=self.runner) - if cm is not None: - self.exit_stack.enter_context(cm) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # First, exit all context managers (in reverse order) - # This happens automatically when we exit the ExitStack - exit_result = None - if self.exit_stack: - exit_result = self.exit_stack.__exit__(exc_type, exc_value, traceback) - - if exc_type is not None and self.observers: - # An exception occurred, notify all observers - # Let ResourceLimitObserver handle it and decide if exception should be suppressed - suppress_exception = True - for observer in self.observers: - try: - # Pass exception to observer, let it handle it - result = observer.observe_exception(runner=self.runner, exc_type=exc_type, exc_value=exc_value, traceback=traceback) - # If observer returns True, it wants to suppress the exception - if result is True: - suppress_exception = True - except Exception: - # Don't let observer exceptions mask the original exception - pass - - # If any observer wants to suppress, suppress the exception - if suppress_exception: - return True - - # Always call observe_exit on all observers - if self.observers: - for observer in self.observers: - try: - observer.observe_exit(runner=self.runner) - except Exception: - # Don't let observer exceptions interfere with cleanup - pass - - # Return the exit result from ExitStack (False to propagate, True to 
suppress) - return exit_result if exit_result is not None else False - -class Observer(ABC): - - def observe_init(self, runner: Runner): - pass - - def observe_pre_transform(self, runner: Runner): - pass - - def observe_post_transform(self, runner: Runner): - pass - - def observe_pre_solve(self, runner: Runner): - pass - - def observe_post_solve(self, runner: Runner): - pass - - def participate_solver_args(self, runner: Runner, solver_args: dict): - return solver_args - - def observe_exception(self, runner: Runner, exc_type, exc_value, traceback): - """ - Called when an exception occurs in the context. - - Returns: - True if the exception should be suppressed, False/None to propagate it. - """ - pass - - def observe_exit(self, runner: Runner): - pass - - def observe_end(self, runner: Runner): - pass - - def print_comment(self, comment: str): - pass - - def observe_intermediate(self, runner: Runner, objective: int): - pass - - def get_context_manager(self, runner: Runner): - """ - Return a context manager that will be entered when the ObserverContext is entered. - Return None if this observer doesn't provide a context manager. 
- """ - return None - -class HandlerObserver(Observer): - - def __init__(self): - self.runner = None - - def observe_init(self, runner: Runner): - self.runner = runner - signal.signal(signal.SIGINT, self._sigterm_handler) - signal.signal(signal.SIGTERM, self._sigterm_handler) - signal.signal(signal.SIGINT, self._sigterm_handler) - signal.signal(signal.SIGABRT, self._sigterm_handler) - if sys.platform != "win32": - signal.signal(signal.SIGXCPU, self._rlimit_cpu_handler) - else: - warnings.warn("Windows does not support setting SIGXCPU signal") - - def _sigterm_handler(self, _signo, _stack_frame): - exit_code = self.handle_sigterm() - print(flush=True) - os._exit(exit_code) - - def _rlimit_cpu_handler(self, _signo, _stack_frame): - # Raise TimeoutError - ObserverContext will handle notifying observers - # Don't notify here to avoid duplicates - raise TimeoutError("CPU time limit reached (SIGXCPU)") - - def handle_sigterm(self): - return 0 - - def handle_rlimit_cpu(self): - return 0 - -class LoggerObserver(Observer): - def __init__(self): - self.logger = logging.getLogger(__name__) - self.logger.setLevel(logging.INFO) - # Add a StreamHandler to output to stdout if no handlers exist - if not self.logger.handlers: - handler = logging.StreamHandler() - handler.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - handler.setFormatter(formatter) - self.logger.addHandler(handler) - - def observe_init(self, runner: Runner): - self.logger.info("Initializing runner") - - def observe_pre_transform(self, runner: Runner): - self.logger.info("Pre-transforming") - - def observe_post_transform(self, runner: Runner): - self.logger.info("Post-transforming") - - def observe_pre_solve(self, runner: Runner): - self.logger.info("Pre-solving") - - def observe_post_solve(self, runner: Runner): - self.logger.info("Post-solving") - - def print_comment(self, comment: str): - self.logger.info(comment) - -class 
CompetitionPrintingObserver(Observer): - - def __init__(self, verbose: bool = False): - self.verbose = verbose - - def print_comment(self, comment: str): - print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) - - def observe_post_solve(self, runner: Runner): - self.print_result(runner.s) - - def observe_intermediate(self, objective: int): - self.print_intermediate(objective) - - def print_status(self, status: str): - print('s' + chr(32) + status, end="\n", flush=True) - - def print_value(self, value: str): - print('v' + chr(32) + value, end="\n", flush=True) - - def print_objective(self, objective: int): - print('o' + chr(32) + str(objective), end="\n", flush=True) - - def print_intermediate(self, objective: int): - self.print_objective(objective) - - def print_result(self, s): - if s.status().exitstatus == CPMStatus.OPTIMAL: - self.print_objective(s.objective_value()) - self.print_value(solution_opb(s)) - self.print_status("OPTIMAL" + chr(32) + "FOUND") - elif s.status().exitstatus == CPMStatus.FEASIBLE: - self.print_objective(s.objective_value()) - self.print_value(solution_opb(s)) - self.print_status("SATISFIABLE") - elif s.status().exitstatus == CPMStatus.UNSATISFIABLE: - self.print_status("UNSATISFIABLE") - else: - self.print_comment("Solver did not find any solution within the time/memory limit") - self.print_status("UNKNOWN") - -class ResourceLimitObserver(Observer): - def __init__(self, time_limit: Optional[int] = None, mem_limit: Optional[int] = None): - self.time_limit = time_limit - self.mem_limit = mem_limit - - def observe_init(self, runner: Runner): - if self.time_limit is not None: - set_time_limit(self.time_limit) - if self.mem_limit is not None: - set_memory_limit(self.mem_limit) - - def _handle_memory_error(self, runner: Runner, mem_limit: int): - runner.print_comment(f"MemoryError raised. 
Reached limit of {mem_limit} MiB") - - def _handle_timeout(self, runner: Runner, time_limit: int): - if time_limit is not None: - runner.print_comment(f"TimeoutError raised. Reached limit of {time_limit} seconds") - else: - runner.print_comment(f"TimeoutError raised. CPU time limit reached") - - def observe_exception(self, runner: Runner, exc_type, exc_value, traceback): - """ - Handle exceptions related to resource limits. - Returns True to suppress the exception after handling. - """ - if exc_type is MemoryError: - # Only handle if we have a memory limit set - if self.mem_limit is not None: - self._handle_memory_error(runner=runner, mem_limit=self.mem_limit) - return True # Suppress the exception after handling - elif exc_type is TimeoutError: - # Only handle if we have a time limit set - if self.time_limit is not None: - self._handle_timeout(runner=runner, time_limit=self.time_limit) - return True # Suppress the exception after handling - return False # Don't suppress other exceptions - - -class SolverArgsObserver(Observer): - - def __init__(self): - self.time_limit = None - self.mem_limit = None - self.seed = None - self.intermediate = False - self.cores = 1 - self.mem_limit = None - self.kwargs = dict() - - def observe_init(self, runner: Runner): - self.time_limit = runner.time_limit - self.mem_limit = runner.mem_limit - self.seed = runner.seed - self.intermediate = runner.intermediate - self.cores = runner.cores - self.mem_limit = runner.mem_limit - self.kwargs = runner.kwargs - - def _ortools_arguments( - self, - runner: Runner, - model: cp.Model, - cores: Optional[int] = None, - seed: Optional[int] = None, - intermediate: bool = False, - **kwargs - ): - # https://github.com/google/or-tools/blob/stable/ortools/sat/sat_parameters.proto - res = dict() - - # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 - res |= { - "interleave_search": True, - "use_rins_lns": False, - } - if not 
model.has_objective(): - res |= { "num_violation_ls": 1 } - - if cores is not None: - res |= { "num_search_workers": cores } - if seed is not None: - res |= { "random_seed": seed } - - if intermediate and model.has_objective(): - # Define custom ORT solution callback, then register it - _self = self - from ortools.sat.python import cp_model as ort - class OrtSolutionCallback(ort.CpSolverSolutionCallback): - """ - For intermediate objective printing. - """ - - def __init__(self): - super().__init__() - self.__start_time = time.time() - self.__solution_count = 1 - - def on_solution_callback(self): - """Called on each new solution.""" - - current_time = time.time() - obj = int(self.ObjectiveValue()) - _self.print_comment('Solution %i, time = %0.4fs' % - (self.__solution_count, current_time - self.__start_time)) - _self.observe_intermediate(runner=runner, objective=obj) - self.__solution_count += 1 - - - def solution_count(self): - """Returns the number of solutions found.""" - return self.__solution_count - - # Register the callback - res |= { "solution_callback": OrtSolutionCallback() } - - def internal_options(solver: "CPM_ortools"): - # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 - solver.ort_solver.parameters.subsolvers.extend(["default_lp", "max_lp", "quick_restart"]) - if not model.has_objective(): - solver.ort_solver.parameters.subsolvers.append("core_or_no_lp") - if len(solver.ort_model.proto.search_strategy) != 0: - solver.ort_solver.parameters.subsolvers.append("fixed") - - return res, internal_options - - def _exact_arguments( - self, - seed: Optional[int] = None, - **kwargs - ): - # Documentation: https://gitlab.com/JoD/exact/-/blob/main/src/Options.hpp?ref_type=heads - res = dict() - if seed is not None: - res |= { "seed": seed } - - return res, None - - def _choco_arguments(self): - # Documentation: https://github.com/chocoteam/pychoco/blob/master/pychoco/solver.py - return 
{}, None - - def _z3_arguments( - self, - model: cp.Model, - cores: int = 1, - seed: Optional[int] = None, - mem_limit: Optional[int] = None, - **kwargs - ): - # Documentation: https://microsoft.github.io/z3guide/programming/Parameters/ - # -> is outdated, just let it crash and z3 will report the available options - - res = dict() - - if model.has_objective(): - # Opt does not seem to support setting random seed or max memory - pass - else: - # Sat parameters - if cores is not None: - res |= { "threads": cores } # TODO what with hyperthreadding, when more threads than cores - if seed is not None: - res |= { "random_seed": seed } - if mem_limit is not None: - res |= { "max_memory": _bytes_as_mb(mem_limit) } - - return res, None - - def _minizinc_arguments( - self, - solver: str, - cores: Optional[int] = None, - seed: Optional[int] = None, - **kwargs - ): - # Documentation: https://minizinc-python.readthedocs.io/en/latest/api.html#minizinc.instance.Instance.solve - res = dict() - if cores is not None: - res |= { "processes": cores } - if seed is not None: - res |= { "random_seed": seed } - - #if solver.endswith("gecode"): - # Documentation: https://www.minizinc.org/doc-2.4.3/en/lib-gecode.html - #elif solver.endswith("chuffed"): - # Documentation: - # - https://www.minizinc.org/doc-2.5.5/en/lib-chuffed.html - # - https://github.com/chuffed/chuffed/blob/develop/chuffed/core/options.h - - return res, None - - def _gurobi_arguments( - self, - model: cp.Model, - cores: Optional[int] = None, - seed: Optional[int] = None, - mem_limit: Optional[int] = None, - intermediate: bool = False, - **kwargs - ): - # Documentation: https://www.gurobi.com/documentation/9.5/refman/parameters.html#sec:Parameters - res = dict() - if cores is not None: - res |= { "Threads": cores } - if seed is not None: - res |= { "Seed": seed } - if mem_limit is not None: - res |= { "MemLimit": _bytes_as_gb(mem_limit) } - - if intermediate and model.has_objective(): - - _self = self - - class 
GurobiSolutionCallback: - def __init__(self, model:cp.Model): - self.__start_time = time.time() - self.__solution_count = 0 - self.model = model - - def callback(self, *args, **kwargs): - current_time = time.time() - model, state = args - - # Callback codes: https://www.gurobi.com/documentation/current/refman/cb_codes.html#sec:CallbackCodes - - from gurobipy import GRB - # if state == GRB.Callback.MESSAGE: # verbose logging - # print_comment("log message: " + str(model.cbGet(GRB.Callback.MSG_STRING))) - if state == GRB.Callback.MIP: # callback from the MIP solver - if model.cbGet(GRB.Callback.MIP_SOLCNT) > self.__solution_count: # do we have a new solution? - - obj = int(model.cbGet(GRB.Callback.MIP_OBJBST)) - _self.print_comment('Solution %i, time = %0.4fs' % - (self.__solution_count, current_time - self.__start_time)) - _self.print_intermediate(obj) - self.__solution_count = model.cbGet(GRB.Callback.MIP_SOLCNT) - - res |= { "solution_callback": GurobiSolutionCallback(model).callback } - - return res, None - - def _cpo_arguments( - self, - model: cp.Model, - cores: Optional[int] = None, - seed: Optional[int] = None, - intermediate: bool = False, - **kwargs - ): - # Documentation: https://ibmdecisionoptimization.github.io/docplex-doc/cp/docplex.cp.parameters.py.html#docplex.cp.parameters.CpoParameters - res = dict() - if cores is not None: - res |= { "Workers": cores } - if seed is not None: - res |= { "RandomSeed": seed } - - if intermediate and model.has_objective(): - from docplex.cp.solver.solver_listener import CpoSolverListener - _self = self - class CpoSolutionCallback(CpoSolverListener): - - def __init__(self): - super().__init__() - self.__start_time = time.time() - self.__solution_count = 1 - - def result_found(self, solver, sres): - current_time = time.time() - obj = sres.get_objective_value() - if obj is not None: - _self.print_comment('Solution %i, time = %0.4fs' % - (self.__solution_count, current_time - self.__start_time)) - 
_self.print_intermediate(obj) - self.__solution_count += 1 - - def solution_count(self): - """Returns the number of solutions found.""" - return self.__solution_count - - # Register the callback - res |= { "solution_callback": CpoSolutionCallback } - - return res, None - - def _cplex_arguments( - self, - cores: Optional[int] = None, - seed: Optional[int] = None, - **kwargs - ): - res = dict() - if cores is not None: - res |= {"threads": cores} - if seed is not None: - res |= {"randomseed": seed} - - return res, None - - def _hexaly_arguments( - self, - model: cp.Model, - cores: Optional[int] = None, - seed: Optional[int] = None, - intermediate: bool = False, - **kwargs - ): - res = dict() - #res |= {"nb_threads": cores} - #res |= {"seed": seed} - - - if intermediate and model.has_objective(): - # Define custom Hexaly solution callback, then register it - - _self = self - class HexSolutionCallback: - - def __init__(self): - self.__start_time = time.time() - self.__solution_count = 0 - - - def on_solution_callback(self, optimizer, cb_type): - """Called on each new solution.""" - # check if solution with different objective (or if verbose) - current_time = time.time() - obj = optimizer.model.objectives[0] - _self.print_comment('Solution %i, time = %0.4fs' % - (self.__solution_count, current_time - self.__start_time)) - _self.print_intermediate(obj) - self.__solution_count += 1 - - def solution_count(self): - return self.__solution_count - - # Register the callback - res |= { "solution_callback": HexSolutionCallback().on_solution_callback } - - - # def internal_options(solver: "CPM_hexaly"): - # # https://github.com/google/or-tools/blob/1c5daab55dd84bca7149236e4b4fa009e5fd95ca/ortools/flatzinc/cp_model_fz_solver.cc#L1688 - # #solver.native_model.get_param().set_seed(seed) - # #solver.native_model.get_param().set_nr_threads(cores) - - # _self = self - # class CallbackExample: - # def __init__(self): - # self.last_best_value = 0 - # self.last_best_running_time = 0 - # 
self.__solution_count = 0 - # self.__start_time = time.time() - - # def my_callback(self, optimizer, cb_type): - # stats = optimizer.statistics - # obj = optimizer.model.objectives[0] - # current_time = time.time() - # #obj = int(self.ObjectiveValue()) - # #obj = optimizer.get_objective_bound(0).value - # if obj.value > self.last_best_value: - # self.last_best_running_time = stats.running_time - # self.last_best_value = obj.value - # self.__solution_count += 1 - - # _self.print_comment('Solution %i, time = %0.4fs' % - # (self.__solution_count, current_time - self.__start_time)) - # _self.print_intermediate(obj.value) - - # optimizer = solver.native_model - # cb = CallbackExample() - # from hexaly.optimizer import HxCallbackType - # optimizer.add_callback(HxCallbackType.TIME_TICKED, cb.my_callback) - - return res, None - - def _solver_arguments( - self, - runner: Runner, - solver: str, - model: cp.Model, - seed: Optional[int] = None, - intermediate: bool = False, - cores: int = 1, - mem_limit: Optional[int] = None, - **kwargs - ): - opt = model.has_objective() - sat = not opt - - if solver == "ortools": - return self._ortools_arguments(runner, model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) - elif solver == "exact": - return self._exact_arguments(seed=seed, **kwargs) - elif solver == "choco": - return self._choco_arguments() - elif solver == "z3": - return self._z3_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, **kwargs) - elif solver.startswith("minizinc"): # also can have a subsolver - return self._minizinc_arguments(solver, cores=cores, seed=seed, **kwargs) - elif solver == "gurobi": - return self._gurobi_arguments(model, cores=cores, seed=seed, mem_limit=mem_limit, intermediate=intermediate, opt=opt, **kwargs) - elif solver == "cpo": - return self._cpo_arguments(model=model, cores=cores, seed=seed, intermediate=intermediate, **kwargs) - elif solver == "hexaly": - return self._hexaly_arguments(model, cores=cores, seed=seed, 
intermediate=intermediate, **kwargs) - elif solver == "cplex": - return self._cplex_arguments(cores=cores, **kwargs) - else: - runner.print_comment(f"setting parameters of {solver} is not (yet) supported") - return dict(), None - - def participate_solver_args(self, runner: Runner, solver_args: dict): - args, internal_options = self._solver_arguments(runner, runner.solver, model=runner.model, seed=self.seed, - intermediate=self.intermediate, - cores=self.cores, mem_limit=_mib_as_bytes(self.mem_limit) if self.mem_limit is not None else None, - **self.kwargs) - - if internal_options is not None: - internal_options(runner.s) - solver_args |= args - runner.print_comment(f"Solver arguments: {args}") - -class ProfilingObserver(Observer): - - def __init__(self): - self.start_time = None - self.end_time = None - self.start_transform_time = None - self.end_transform_time = None - - def observe_init(self, runner: Runner): - self.start_time = time.time() - - def observe_pre_transform(self, runner: Runner): - self.start_transform_time = time.time() - - def observe_post_transform(self, runner: Runner): - self.end_transform_time = time.time() - runner.print_comment(f"Time taken to transform: {self.end_transform_time - self.start_transform_time} seconds") - - def observe_post_solve(self, runner: Runner): - runner.print_comment(f"Time taken to solve: {runner.s.status().runtime} seconds") - - def observe_end(self, runner: Runner): - runner.print_comment(f"Total time taken: {time.time() - self.start_time} seconds") - -class SolutionCheckerObserver(Observer): - - def observe_end(self, runner: Runner): - runner.print_comment(f"Run solution checker here...") - -class WriteToFileObserver(Observer): - def __init__(self, file_path: str): - self.file_path = file_path - - def get_context_manager(self, runner: Runner): - """Return a context manager that redirects stdout to a file.""" - @contextlib.contextmanager - def redirect_to_file(): - with open(self.file_path, 'w') as f: - with 
contextlib.redirect_stdout(f): - yield - return redirect_to_file() - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("instance", type=str) - parser.add_argument("--verbose", action="store_true", default=False) - parser.add_argument("--solver", type=str, default="ortools") - parser.add_argument("--time_limit", type=int, default=None) - parser.add_argument("--mem_limit", type=int, default=None) - parser.add_argument("--seed", type=int, default=None) - parser.add_argument("--intermediate", action="store_true", default=False) - parser.add_argument("--cores", type=int, default=None) - parser.add_argument("--output_file", type=str, default=None) - # parser.add_argument("--kwargs", type=str, default="") - - args = parser.parse_args() - - - if args.output_file is None: - args.output_file = f"results/{args.solver}_{args.instance}.txt" - else: - args.output_file = f"results/{args.output_file}" - - Path(args.output_file).parent.mkdir(parents=True, exist_ok=True) - - - from cpmpy.tools.rcpsp import read_rcpsp - from cpmpy.tools.dataset.problem.psplib import PSPLibDataset - dataset = PSPLibDataset(root="./data", download=True) - - runner = Runner(reader=partial(read_rcpsp, open=dataset.open)) - # runner.register_observer(LoggerObserver()) - runner.register_observer(CompetitionPrintingObserver()) - runner.register_observer(ProfilingObserver()) - # runner.register_observer(ResourceLimitObserver(time_limit=args.time_limit, mem_limit=args.mem_limit)) - runner.register_observer(HandlerObserver()) - runner.register_observer(SolverArgsObserver()) - runner.register_observer(SolutionCheckerObserver()) - runner.register_observer(WriteToFileObserver(file_path=args.output_file)) - print(vars(args)) - runner.run(**vars(args)) - -if __name__ == "__main__": - main() - - # from cpmpy.tools.dataset.model.xcsp3 import XCSP3Dataset - # from cpmpy.tools.xcsp3 import read_xcsp3 - - # from cpmpy.tools.dataset.model.opb import OPBDataset - # from cpmpy.tools.opb import 
read_opb - - # from cpmpy.tools.dataset.problem.jsplib import JSPLibDataset - # from cpmpy.tools.jsplib import read_jsplib - - # from cpmpy.tools.dataset.problem.psplib import PSPLibDataset - # from cpmpy.tools.rcpsp import read_rcpsp - - # # dataset = XCSP3Dataset(root="./data", year=2025, track="CSP25", download=True) - # dataset = OPBDataset(root="./data", year=2024, track="DEC-LIN", download=True) - # dataset = JSPLibDataset(root="./data", download=True) - # dataset = PSPLibDataset(root="./data", download=True) - - # for instance, metadata in dataset: - # print(instance, metadata) - # runner = Runner(reader=partial(read_rcpsp, open=dataset.open)) - # #runner.register_observer(LoggerObserver()) - # runner.register_observer(CompetitionPrintingObserver()) - # runner.register_observer(ProfilingObserver()) - # #runner.register_observer(ResourceLimitObserver(time_limit=10, mem_limit=1024)) - # runner.register_observer(HandlerObserver()) - # runner.register_observer(SolverArgsObserver()) - # runner.register_observer(SolutionCheckerObserver()) - # runner.run(instance, solver="ortools") - - # break \ No newline at end of file diff --git a/cpmpy/tools/benchmark/test/xcsp3_instance_runner.py b/cpmpy/tools/benchmark/test/xcsp3_instance_runner.py deleted file mode 100644 index 4c45d1e89..000000000 --- a/cpmpy/tools/benchmark/test/xcsp3_instance_runner.py +++ /dev/null @@ -1,66 +0,0 @@ -from functools import partial -import lzma -from pathlib import Path -from cpmpy.tools.benchmark.test.instance_runner import InstanceRunner -import os, sys - -from cpmpy.tools.benchmark.test.runner import CompetitionPrintingObserver, HandlerObserver, ProfilingObserver, ResourceLimitObserver, Runner, SolverArgsObserver, SolutionCheckerObserver -from cpmpy.tools.xcsp3.parser import read_xcsp3 - -class XCSP3InstanceRunner(InstanceRunner): - - this_file_path = os.path.abspath(__file__) - this_python = sys.executable - - def cmd(self, instance: str, solver: str = "ortools", output_file: str = 
None, **kwargs): - cmd = [ - self.this_python, - self.this_file_path, - instance, - ] - if solver is not None: - cmd.append("--solver") - cmd.append(solver) - if output_file is not None: - cmd.append("--output_file") - cmd.append(output_file) - return cmd - - def print_comment(self, comment: str): - print('c' + chr(32) + comment.rstrip('\n'), end="\r\n", flush=True) - - def run(self, instance: str, solver: str = "ortools", output_file: str = None, **kwargs): - - if output_file is None: - output_file = f"results/{solver}_{instance}.txt" - else: - output_file = f"results/{output_file}" - - Path(output_file).parent.mkdir(parents=True, exist_ok=True) - - runner = Runner(reader=partial(read_xcsp3, open= lambda instance: lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance))) - - runner.register_observer(CompetitionPrintingObserver()) - runner.register_observer(ProfilingObserver()) - runner.register_observer(HandlerObserver()) - runner.register_observer(SolverArgsObserver()) - runner.register_observer(SolutionCheckerObserver()) - runner.register_observer(ResourceLimitObserver()) # Don't enforce any limits, just observe / capture exceptions - - # Register any additional observers that were added programmatically - for observer in self.get_additional_observers(): - runner.register_observer(observer) - - runner.run(instance=instance, solver=solver, output_file=output_file, **kwargs) - - -def main(): - runner = XCSP3InstanceRunner() - - parser = runner.argparser() - args = parser.parse_args() - - runner.run(**vars(args)) - -if __name__ == "__main__": - main() \ No newline at end of file From 804c96ffe7bb6cfaff328f23fc31c6d774429a21 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 4 Feb 2026 13:01:53 +0100 Subject: [PATCH 095/152] setup command --- cpmpy/tools/benchmark/test/manager.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpmpy/tools/benchmark/test/manager.py 
b/cpmpy/tools/benchmark/test/manager.py index 19d507444..4cb468548 100644 --- a/cpmpy/tools/benchmark/test/manager.py +++ b/cpmpy/tools/benchmark/test/manager.py @@ -8,6 +8,7 @@ import warnings import logging from pathlib import Path +from typing import Optional, List from cpmpy.tools.benchmark import _mib_as_bytes from cpmpy.tools.benchmark.test.instance_runner import InstanceRunner @@ -232,6 +233,7 @@ def run(self, intermediate: bool, verbose: bool, output_file: str, + setup_command: Optional[List[str]] = None, ) -> dict: """ Run a single instance with assigned resources. @@ -292,6 +294,11 @@ def signal_handler_kill(signum, frame): #"--cores", str(len(cores)) # Pass number of cores to the solver ] + # Prepend setup_command if provided - this wraps the entire command invocation + # e.g., systemd-run --user --scope --slice=benchexec -p Delegate=yes python script.py --args + if setup_command: + cmd = list(setup_command) + cmd + result = executor.execute_run( args=cmd, output_filename=tmp_filename, # Capture subprocess output to temp file @@ -401,7 +408,7 @@ def run(self, -def run_instance(instance: str, instance_runner: InstanceRunner, time_limit: int, memory_limit: int, cores: list[int], resource_manager: ResourceManager, solver: str, seed: int, intermediate: bool, verbose: bool, output_file: str): +def run_instance(instance: str, instance_runner: InstanceRunner, time_limit: int, memory_limit: int, cores: list[int], resource_manager: ResourceManager, solver: str, seed: int, intermediate: bool, verbose: bool, output_file: str, setup_command=None): """ Run a single instance with assigned cores. 
@@ -411,10 +418,11 @@ def run_instance(instance: str, instance_runner: InstanceRunner, time_limit: int time_limit: Time limit in seconds memory_limit: Memory limit in MB cores: List of core IDs to assign to this run (e.g., [0, 1] for cores 0 and 1) + setup_command: Optional command to prefix before running (list of strings) """ - resource_manager.run(instance, instance_runner, time_limit, memory_limit, cores, solver, seed, intermediate, verbose, output_file) + resource_manager.run(instance, instance_runner, time_limit, memory_limit, cores, solver, seed, intermediate, verbose, output_file, setup_command) # Convert cores list to comma-separated string for runexec From 71802144e1b3e7d79c88c71d4d02c2ad155f8fdd Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 4 Feb 2026 14:05:48 +0100 Subject: [PATCH 096/152] fix for pinac --- cpmpy/tools/benchmark/test/manager.py | 78 +++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/cpmpy/tools/benchmark/test/manager.py b/cpmpy/tools/benchmark/test/manager.py index 4cb468548..30c67cd85 100644 --- a/cpmpy/tools/benchmark/test/manager.py +++ b/cpmpy/tools/benchmark/test/manager.py @@ -7,6 +7,8 @@ import contextlib import warnings import logging +import secrets +import subprocess from pathlib import Path from typing import Optional, List @@ -16,6 +18,79 @@ from cpmpy.tools.benchmark.test.observer import ResourceLimitObserver +def _ensure_systemd_scope(): + """ + Ensure the current process is in its own systemd scope with cgroup delegation. + + This is required for BenchExec's RunExecutor to work properly with cgroups v2. + When running under a parent systemd scope (e.g., via systemd-run), child processes + need their own scope to enable cgroup subtree delegation. + + Uses busctl to call systemd's D-Bus API directly, avoiding the need for pystemd. + + Returns True if successful or already in a suitable scope, False otherwise. 
+ """ + # Check if we're already in our own benchexec scope (to avoid re-creating) + try: + with open("/proc/self/cgroup", "r") as f: + cgroup_info = f.read() + if "benchexec_worker_" in cgroup_info: + logging.debug("Already in a benchexec worker scope") + return True + except Exception: + pass + + # Create a new transient scope for this process + random_suffix = secrets.token_urlsafe(8) + scope_name = f"benchexec_worker_{random_suffix}.scope" + + try: + # Use busctl to create a transient scope unit + # This is equivalent to what pystemd does but using command line + cmd = [ + "busctl", "--user", "call", + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + "StartTransientUnit", + "ssa(sv)a(sa(sv))", + scope_name, # unit name + "fail", # mode + "3", # number of properties + "PIDs", "au", "1", str(os.getpid()), # Add current PID + "Delegate", "b", "true", # Enable delegation + "Slice", "s", "benchexec.slice", # Put in benchexec slice + "0" # no auxiliary units + ] + + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=10 + ) + + if result.returncode == 0: + logging.debug(f"Created systemd scope: {scope_name}") + # Give systemd a moment to move the process + import time + time.sleep(0.1) + return True + else: + logging.warning(f"Failed to create systemd scope: {result.stderr}") + return False + + except FileNotFoundError: + logging.warning("busctl not found, cannot create systemd scope") + return False + except subprocess.TimeoutExpired: + logging.warning("Timeout creating systemd scope") + return False + except Exception as e: + logging.warning(f"Error creating systemd scope: {e}") + return False + + class ResourceManager: """ Abstract base class for resource managers. 
@@ -261,6 +336,9 @@ def run(self, _runner.print_comment(f"Running instance {instance} with time limit {time_limit} and memory limit {memory_limit} and cores {cores}") _runner.print_comment(f"Running with manager {self.__class__.__name__}") + # Ensure we're in our own systemd scope for cgroup delegation (required for cgroups v2) + _ensure_systemd_scope() + from benchexec.runexecutor import RunExecutor # Use a temporary file to capture subprocess output, then forward it From ad0d06188120d37e6bf178a9e4dc39c5af14aaf4 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 11 Feb 2026 21:46:40 +0100 Subject: [PATCH 097/152] update to new IO location --- cpmpy/tools/benchmark/jsplib.py | 2 +- cpmpy/tools/benchmark/mse.py | 2 +- cpmpy/tools/benchmark/nurserostering.py | 2 +- cpmpy/tools/benchmark/psplib.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpmpy/tools/benchmark/jsplib.py b/cpmpy/tools/benchmark/jsplib.py index 343c2dfdd..999ea148d 100644 --- a/cpmpy/tools/benchmark/jsplib.py +++ b/cpmpy/tools/benchmark/jsplib.py @@ -50,7 +50,7 @@ # CPMpy from cpmpy.tools.benchmark.runner import benchmark_runner from cpmpy.tools.benchmark._base import Benchmark, ExitStatus -from cpmpy.tools.jsplib import read_jsplib +from cpmpy.tools.io.jsplib import read_jsplib from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus diff --git a/cpmpy/tools/benchmark/mse.py b/cpmpy/tools/benchmark/mse.py index a1936346e..ffe90cc5d 100644 --- a/cpmpy/tools/benchmark/mse.py +++ b/cpmpy/tools/benchmark/mse.py @@ -53,7 +53,7 @@ # CPMpy from cpmpy.tools.benchmark.runner import benchmark_runner from cpmpy.tools.benchmark._base import Benchmark -from cpmpy.tools.wcnf import read_wcnf +from cpmpy.tools.io.wcnf import read_wcnf from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus diff --git a/cpmpy/tools/benchmark/nurserostering.py b/cpmpy/tools/benchmark/nurserostering.py index 31976f229..090eb1b03 100644 --- a/cpmpy/tools/benchmark/nurserostering.py +++ 
b/cpmpy/tools/benchmark/nurserostering.py @@ -48,7 +48,7 @@ # CPMpy from cpmpy.tools.benchmark.runner import benchmark_runner from cpmpy.tools.benchmark._base import Benchmark, ExitStatus -from cpmpy.tools.nurserostering import read_nurserostering +from cpmpy.tools.io.nurserostering import read_nurserostering from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus diff --git a/cpmpy/tools/benchmark/psplib.py b/cpmpy/tools/benchmark/psplib.py index 0f1a1639f..0fcf53950 100644 --- a/cpmpy/tools/benchmark/psplib.py +++ b/cpmpy/tools/benchmark/psplib.py @@ -51,7 +51,7 @@ # CPMpy from cpmpy.tools.benchmark.runner import benchmark_runner from cpmpy.tools.benchmark._base import Benchmark, ExitStatus -from cpmpy.tools.rcpsp import read_rcpsp +from cpmpy.tools.io.rcpsp import read_rcpsp from cpmpy.solvers.solver_interface import ExitStatus as CPMStatus From 7a48b1e25aaf654571781b82f8d1f55052013b47 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 12 Feb 2026 17:07:21 +0100 Subject: [PATCH 098/152] Update writers --- cpmpy/tools/io/opb.py | 48 ++++++++++++++++++++---- cpmpy/tools/io/scip.py | 80 ++++++++++++++++++++++++++++++---------- cpmpy/tools/io/writer.py | 36 ++++++++++++++++++ 3 files changed, 137 insertions(+), 27 deletions(-) diff --git a/cpmpy/tools/io/opb.py b/cpmpy/tools/io/opb.py index ef32be718..af2ad7c74 100644 --- a/cpmpy/tools/io/opb.py +++ b/cpmpy/tools/io/opb.py @@ -37,10 +37,11 @@ from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective from cpmpy.transformations.reification import only_implies, only_bv_reifies from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv_wsum -from cpmpy.transformations.int2bool import int2bool +from cpmpy.transformations.int2bool import int2bool, _encode_int_var, _decide_encoding from cpmpy.transformations.get_variables import get_variables from cpmpy.expressions.variables import _IntVarImpl, NegBoolView, _BoolVarImpl from cpmpy.expressions.core import 
Operator, Comparison +from cpmpy.expressions.utils import is_num from cpmpy import __version__ @@ -200,7 +201,7 @@ def read_opb(opb: Union[str, os.PathLike], open=open) -> cp.Model: return model -def write_opb(model, fname=None, encoding="auto"): +def write_opb(model, fname=None, encoding="auto", header=None): """ Export a CPMpy model to the OPB (Pseudo-Boolean) format. @@ -213,6 +214,8 @@ def write_opb(model, fname=None, encoding="auto"): model (cp.Model): The CPMpy model to export. fname (str, optional): The file name to write the OPB output to. If None, the OPB string is returned. encoding (str, optional): The encoding used for `int2bool`. Options: ("auto", "direct", "order", "binary"). + header (str, optional): Optional header text to add as OPB comments. If provided, each line + will be prefixed with "* ". Returns: str or None: The OPB string if `fname` is None, otherwise nothing (writes to file). @@ -241,7 +244,7 @@ def write_opb(model, fname=None, encoding="auto"): if model.objective_ is not None: opb_obj, const, extra_cons = _transform_objective(model.objective_, csemap, ivarmap, encoding) - opb_cons += extra_cons + opb_cons += _transform(extra_cons, csemap, ivarmap, encoding) else: opb_obj = None @@ -252,6 +255,9 @@ def write_opb(model, fname=None, encoding="auto"): f"* #variable= {len(all_vars)} #constraint= {len(opb_cons)}", f"* OPB file generated by CPMpy version {__version__}", ] + if header: + header_lines = ["* " + line for line in str(header).splitlines()] + out.extend(header_lines) # Remap variables to 'x1', 'x2', ..., the standard OPB way varmap = {v: f"x{i+1}" for i, v in enumerate(all_vars)} @@ -292,9 +298,11 @@ def _normalized_comparison(lst_of_expr): """ newlist = [] for cpm_expr in lst_of_expr: - if isinstance(cpm_expr, cp.BoolVal) and cpm_expr.value() is False: - raise NotImplementedError(f"Cannot transform {cpm_expr} to OPB constraint") - + if isinstance(cpm_expr, cp.BoolVal): + if cpm_expr.value() is False: + raise 
NotImplementedError(f"Cannot transform {cpm_expr} to OPB constraint") + continue # trivially True, skip + # single Boolean variable if isinstance(cpm_expr, _BoolVarImpl): cpm_expr = Operator("sum", [cpm_expr]) >= 1 @@ -311,7 +319,7 @@ def _normalized_comparison(lst_of_expr): if isinstance(cpm_expr, Comparison): lhs, rhs = cpm_expr.args - if isinstance(lhs, _BoolVarImpl): + if isinstance(lhs, (_BoolVarImpl, _IntVarImpl)): lhs = Operator("sum", [lhs]) if lhs.name == "sum": lhs = Operator("wsum", [[1]*len(lhs.args), lhs.args]) @@ -414,6 +422,32 @@ def _transform_objective(expr, csemap, ivarmap, encoding="auto"): return obj, const, safe_cons + decomp_cons + flat_cons +def _encode_lin_expr(ivarmap, xs, weights, encoding="auto"): + """ + Encode a linear expression (weights * xs) to PB terms and domain constraints. + + Returns: + (terms, constraints, k) + """ + terms = [] + constraints = [] + k = 0 + + for w, x in zip(weights, xs): + if is_num(x): + k += w * x + elif isinstance(x, _BoolVarImpl): + terms.append((w, x)) + else: + enc, cons = _encode_int_var(ivarmap, x, _decide_encoding(x, None, encoding)) + constraints += cons + new_terms, k_i = enc.encode_term(w) + terms += new_terms + k += k_i + + return terms, constraints, k + + def main(): parser = argparse.ArgumentParser(description="Parse and solve an OPB model using CPMpy") parser.add_argument("model", help="Path to an OPB file (or raw OPB string if --string is given)") diff --git a/cpmpy/tools/io/scip.py b/cpmpy/tools/io/scip.py index 1f32dc6e5..3f9d7086b 100644 --- a/cpmpy/tools/io/scip.py +++ b/cpmpy/tools/io/scip.py @@ -44,15 +44,24 @@ from cpmpy.expressions.core import BoolVal, Comparison, Operator from cpmpy.expressions.variables import _NumVarImpl, _BoolVarImpl, NegBoolView, _IntVarImpl from cpmpy.transformations.comparison import only_numexpr_equality -from cpmpy.transformations.decompose_global import decompose_in_tree +from cpmpy.transformations.decompose_global import decompose_in_tree, decompose_objective 
from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective from cpmpy.transformations.get_variables import get_variables -from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv +from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv, only_positive_bv_wsum from cpmpy.transformations.normalize import toplevel_list -from cpmpy.transformations.reification import only_implies, reify_rewrite +from cpmpy.transformations.reification import only_bv_reifies, only_implies, reify_rewrite from cpmpy.expressions.utils import is_any_list, is_num from cpmpy.expressions.globalconstraints import DirectConstraint # from cpmpy.expressions.variables import ignore_variable_name_check +from cpmpy.transformations.safening import no_partial_functions, safen_objective + +try: + from cpmpy.expressions.variables import _ignore_variable_name_check +except ImportError: + from contextlib import contextmanager + @contextmanager + def _ignore_variable_name_check(): + yield _std_open = open @@ -181,6 +190,14 @@ class _SCIPWriter: TODO: code should be reused once SCIP has been added as a solver backend. """ + # Globals we keep (decompose_in_tree) and how they are translated: + # - "xor": kept; linearize passes it through; we translate to addConsXor() in add(). + # - "abs": GlobalFunction supported natively (PySCIPOpt addCons(abs(x) <= k)). + # SCIP has no native AllDifferent, Circuit, Table, Cumulative, etc.; others are decomposed by decompose_in_tree. 
+ supported_global_constraints = frozenset({"xor", "abs"}) + supported_reified_global_constraints = frozenset() + + @staticmethod def supported(): # try to import the package @@ -249,17 +266,23 @@ def objective(self, expr, minimize=True): are premanently posted to the solver) """ - # make objective function non-nested - (flat_obj, flat_cons) = (flatten_objective(expr)) - self += flat_cons - get_variables(flat_obj, collect=self.user_vars) # add potentially created constraints + get_variables(expr, collect=self.user_vars) + + obj, safe_cons = safen_objective(expr) + obj, decomp_cons = decompose_objective(obj, + supported=self.supported_global_constraints, + supported_reified=self.supported_reified_global_constraints, + csemap=self._csemap) + obj, flat_cons = flatten_objective(obj, csemap=self._csemap) + obj = only_positive_bv_wsum(obj) - # make objective function or variable and post - obj = self._make_numexpr(flat_obj) + self.add(safe_cons + decomp_cons + flat_cons) + + scip_obj = self._make_numexpr(obj) if minimize: - self.scip_model.setObjective(obj, sense='minimize') + self.scip_model.setObjective(scip_obj, sense='minimize') else: - self.scip_model.setObjective(obj, sense='maximize') + self.scip_model.setObjective(scip_obj, sense='maximize') def _make_numexpr(self, cpm_expr): @@ -307,15 +330,22 @@ def transform(self, cpm_expr): """ # apply transformations, then post internally # expressions have to be linearized to fit in MIP model. 
See /transformations/linearize + + _csemap = {} + cpm_cons = toplevel_list(cpm_expr) - supported = {"alldifferent"} # alldiff has a specialized MIP decomp in linearize - cpm_cons = decompose_in_tree(cpm_cons, supported) - cpm_cons = flatten_constraint(cpm_cons) # flat normal form - cpm_cons = reify_rewrite(cpm_cons, supported=frozenset(['sum', 'wsum','sub'])) # constraints that support reification - cpm_cons = only_numexpr_equality(cpm_cons, supported=frozenset(["sum", "wsum", "sub"])) # supports >, <, != - cpm_cons = only_implies(cpm_cons) # anything that can create full reif should go above... - cpm_cons = linearize_constraint(cpm_cons, supported=frozenset({"sum", "wsum","sub", "mul", "div"})) # the core of the MIP-linearization - cpm_cons = only_positive_bv(cpm_cons) # after linearization, rewrite ~bv into 1-bv + cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"mod", "div", "element"}) + cpm_cons = decompose_in_tree(cpm_cons, + supported=self.supported_global_constraints | {"alldifferent"}, + supported_reified=self.supported_reified_global_constraints, + csemap=self._csemap) + cpm_cons = flatten_constraint(cpm_cons, csemap=self._csemap) + cpm_cons = reify_rewrite(cpm_cons, supported=frozenset(['sum', 'wsum', 'sub']), csemap=self._csemap) + cpm_cons = only_numexpr_equality(cpm_cons, supported=frozenset(["sum", "wsum", "sub"]) | self.supported_global_constraints, csemap=self._csemap) + cpm_cons = only_bv_reifies(cpm_cons, csemap=self._csemap) + cpm_cons = only_implies(cpm_cons, csemap=self._csemap) + cpm_cons = linearize_constraint(cpm_cons, supported=frozenset({"sum", "wsum", "sub", "mul", "div", "sum!=", "wsum!="}) | self.supported_global_constraints, csemap=self._csemap) + cpm_cons = only_positive_bv(cpm_cons, csemap=self._csemap) return cpm_cons def _get_constraint_name(self): @@ -542,7 +572,17 @@ def write_scip(model: cp.Model, fname: Optional[str] = None, format: str = "mps" with tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False) as 
tmp: fname = tmp.name try: - writer.scip_model.writeProblem(fname) + writer.scip_model.hideOutput() + # Suppress SCIP's C-level "wrote problem to file" message + devnull = os.open(os.devnull, os.O_WRONLY) + old_stdout = os.dup(1) + os.dup2(devnull, 1) + try: + writer.scip_model.writeProblem(fname) + finally: + os.dup2(old_stdout, 1) + os.close(devnull) + os.close(old_stdout) _add_header(fname, format, header) with open(fname, "r") as f: return f.read() diff --git a/cpmpy/tools/io/writer.py b/cpmpy/tools/io/writer.py index c4074e22d..59c6029de 100644 --- a/cpmpy/tools/io/writer.py +++ b/cpmpy/tools/io/writer.py @@ -45,6 +45,42 @@ # "wcnf": write_wcnf, # currently not supported } +# Maps each format to the external packages its writer depends on. +# Used by writer_dependencies() to record provenance in sidecar metadata. +_writer_deps = { + "mps": ["pyscipopt"], + "lp": ["pyscipopt"], + "cip": ["pyscipopt"], + "fzn": ["pyscipopt"], + "gms": ["pyscipopt"], + "pip": ["pyscipopt"], + "dimacs": ["pindakaas"], + "opb": [], +} + + +def writer_dependencies(format: str) -> dict: + """Return a dict of ``{package_name: version}`` for the writer's external deps. + + Arguments: + format: target format name (e.g., ``"mps"``, ``"dimacs"``, ``"opb"``). + + Returns: + dict mapping package names to installed version strings. + Packages that are not installed are omitted. + """ + from importlib.metadata import version, PackageNotFoundError + + deps = _writer_deps.get(format, []) + result = {} + for pkg in deps: + try: + result[pkg] = version(pkg) + except PackageNotFoundError: + pass + return result + + def _get_writer(format: str) -> Callable: """ Get the writer function for a given format. 
From 0baaeb2005ae76ae09ac4c87399e652806b1e634 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 12 Feb 2026 17:09:00 +0100 Subject: [PATCH 099/152] fix import --- cpmpy/solvers/cpo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpmpy/solvers/cpo.py b/cpmpy/solvers/cpo.py index 80692d6f4..9a38e99f9 100644 --- a/cpmpy/solvers/cpo.py +++ b/cpmpy/solvers/cpo.py @@ -46,7 +46,7 @@ import warnings from .solver_interface import SolverInterface, SolverStatus, ExitStatus, Callback -from .. import DirectConstraint +from ..expressions.globalconstraints import DirectConstraint from ..expressions.core import Expression, Comparison, Operator, BoolVal from ..expressions.globalconstraints import GlobalConstraint from ..expressions.globalfunctions import GlobalFunction From 017e1a29f51945cb286d77c6bee09ff518a288c6 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 12 Feb 2026 17:12:04 +0100 Subject: [PATCH 100/152] experimental metadata collection --- cpmpy/tools/dataset/_base.py | 570 ++++++++++++++++++++++++-- cpmpy/tools/dataset/jsplib.py | 161 +++++--- cpmpy/tools/dataset/miplib.py | 61 ++- cpmpy/tools/dataset/mse.py | 53 ++- cpmpy/tools/dataset/nurserostering.py | 40 +- cpmpy/tools/dataset/opb.py | 56 ++- cpmpy/tools/dataset/psplib.py | 77 +++- cpmpy/tools/dataset/xcsp3.py | 48 ++- 8 files changed, 953 insertions(+), 113 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 3bf91076d..b8bc38e67 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -2,18 +2,21 @@ Dataset Base Class This module defines the abstract `_Dataset` class, which serves as the foundation -for loading and managing benchmark instance collections in CPMpy-based experiments. +for loading and managing benchmark instance collections in CPMpy-based experiments. It standardizes how datasets are stored, accessed, and optionally transformed. 
""" from abc import ABC, abstractmethod +import json import os import pathlib import io import tempfile -from typing import Any, Optional, Tuple +import warnings +from typing import Any, Optional, Tuple, List from urllib.error import URLError from urllib.request import HTTPError, Request, urlopen +from concurrent.futures import ThreadPoolExecutor, as_completed def format_bytes(bytes_num): """ @@ -29,6 +32,103 @@ def format_bytes(bytes_num): except ImportError: tqdm = None + +# Fields produced by extract_model_features() — not portable across format translations +_MODEL_FEATURE_FIELDS = frozenset({ + "num_variables", "num_bool_variables", "num_int_variables", + "num_constraints", "constraint_types", "has_objective", + "objective_type", "domain_size_min", "domain_size_max", "domain_size_mean", +}) + +# Prefixes for format-specific metadata fields (not portable across translations) +_FORMAT_SPECIFIC_PREFIXES = ("opb_", "wcnf_", "mps_", "xcsp_", "dimacs_") + + +def portable_instance_metadata(metadata: dict) -> dict: + """Filter sidecar metadata to only portable, domain-specific fields. + + Strips model features (num_variables, constraint_types, ...), + format-specific fields (opb_*, wcnf_*, mps_*, ...), and internal + error fields (starting with ``_``). + + Keeps domain-specific metadata that is independent of the file format, + such as ``jobs``, ``machines``, ``optimum``, ``horizon``, ``bounds``, etc. + + Arguments: + metadata (dict): Full sidecar metadata dictionary. + + Returns: + dict with only portable fields. + """ + return { + k: v for k, v in metadata.items() + if not k.startswith("_") + and k not in _MODEL_FEATURE_FIELDS + and not any(k.startswith(p) for p in _FORMAT_SPECIFIC_PREFIXES) + } + + +def extract_model_features(model) -> dict: + """ + Extract generic CP features from a CPMpy Model. 
+ + Arguments: + model: a cpmpy.Model instance + + Returns: + dict with keys: num_variables, num_bool_variables, num_int_variables, + num_constraints, constraint_types, has_objective, objective_type, + domain_size_min, domain_size_max, domain_size_mean + """ + from cpmpy.transformations.get_variables import get_variables_model + from cpmpy.expressions.variables import _BoolVarImpl + from cpmpy.expressions.core import Expression + from cpmpy.expressions.utils import is_any_list + + variables = get_variables_model(model) + + num_bool = sum(1 for v in variables if isinstance(v, _BoolVarImpl)) + num_int = len(variables) - num_bool + + # Domain sizes (lb/ub available on all variable types) + domain_sizes = [int(v.ub) - int(v.lb) + 1 for v in variables] if variables else [] + + # Constraint types: collect .name from top-level constraints + constraint_type_counts = {} + + def _count_constraints(c): + if is_any_list(c): + for sub in c: + _count_constraints(sub) + elif isinstance(c, Expression): + name = c.name + constraint_type_counts[name] = constraint_type_counts.get(name, 0) + 1 + + for c in model.constraints: + _count_constraints(c) + + num_constraints = sum(constraint_type_counts.values()) + + # Objective + has_obj = model.objective_ is not None + obj_type = "none" + if has_obj: + obj_type = "min" if model.objective_is_min else "max" + + return { + "num_variables": len(variables), + "num_bool_variables": num_bool, + "num_int_variables": num_int, + "num_constraints": num_constraints, + "constraint_types": constraint_type_counts, + "has_objective": has_obj, + "objective_type": obj_type, + "domain_size_min": min(domain_sizes) if domain_sizes else None, + "domain_size_max": max(domain_sizes) if domain_sizes else None, + "domain_size_mean": round(sum(domain_sizes) / len(domain_sizes), 2) if domain_sizes else None, + } + + class _Dataset(ABC): """ Abstract base class for PyTorch-style datasets of benchmarking instances. 
@@ -37,10 +137,26 @@ class _Dataset(ABC): accessing benchmark instances. This class should not be used on its own. """ + # Extension for metadata sidecar files + METADATA_EXTENSION = ".meta.json" + + # Dataset-level metadata (override in subclasses) + description = "" + url = "" + license = "" + citation = "" + domain = "" + format = "" + reader = None # callable(file_path, open=open) -> cp.Model + + # Multiple download origins (override in subclasses or via config) + # Origins are tried in order, falling back to original url if all fail + origins: List[str] = [] # List of URL bases to try before falling back to original url + def __init__( - self, + self, dataset_dir: str = ".", - transform=None, target_transform=None, + transform=None, target_transform=None, download: bool = False, extension:str=".txt", **kwargs @@ -55,18 +171,35 @@ def __init__( raise ValueError(f"Dataset not found. Please set download=True to download the dataset.") else: self.download() - files = sorted(list(self.dataset_dir.rglob(f"*{self.extension}"))) + self._collect_all_metadata() + files = self._list_instances() print(f"Finished downloading {len(files)} instances") - files = sorted(list(self.dataset_dir.rglob(f"*{self.extension}"))) + # Generate sidecar metadata for existing datasets that lack them + self._collect_all_metadata() + + files = self._list_instances() if len(files) == 0: raise ValueError(f"Cannot find any instances inside dataset {self.dataset_dir}. Is it a valid dataset? If so, please report on GitHub.") - + + @classmethod + def dataset_metadata(cls) -> dict: + """Return dataset-level metadata as a dictionary.""" + return { + "name": cls.name, + "description": cls.description, + "url": cls.url, + "license": cls.license, + "citation": cls.citation, + "domain": cls.domain, + "format": cls.format, + } + @abstractmethod def category(self) -> dict: """ Labels to distinguish instances into categories matching to those of the dataset. - E.g. + E.g. 
- year - track """ @@ -79,10 +212,242 @@ def download(self, *args, **kwargs): """ pass + def _list_instances(self) -> list: + """ + List all instance files, excluding metadata sidecar files. + + Returns a sorted list of pathlib.Path objects for all instance files + matching the dataset's extension pattern. + """ + return sorted([ + f for f in self.dataset_dir.rglob(f"*{self.extension}") + if f.is_file() and not str(f).endswith(self.METADATA_EXTENSION) + ]) + + def _metadata_path(self, instance_path) -> pathlib.Path: + """Return the path to the .meta.json sidecar file for a given instance.""" + return pathlib.Path(str(instance_path) + self.METADATA_EXTENSION) + + def collect_instance_metadata(self, file) -> dict: + """ + Override in subclass to provide domain-specific instance metadata. + Called once after download for each instance. + + Arguments: + file: path to the instance file + + Returns: + dict with instance-specific metadata fields + """ + return {} + + def _collect_all_metadata(self, force=False): + """Collect and store structured metadata sidecar files for all instances. + + Writes a structured ``.meta.json`` sidecar alongside each instance with: + + - ``dataset``: dataset-level metadata (name, description, url, ...) + - ``instance_name``: logical instance name (filename stem) + - ``source_file``: path to the instance file + - ``category``: dataset category labels (year, track, variant, family) + - ``instance_metadata``: portable domain-specific metadata + - ``format_metadata``: format-specific metadata from the source format + + Arguments: + force (bool): If True, re-collect instance metadata even if sidecar + files already exist. 
+ """ + files = self._list_instances() + + # Filter files that need processing + files_to_process = [] + for file_path in files: + meta_path = self._metadata_path(file_path) + if force or not meta_path.exists(): + files_to_process.append(file_path) + else: + # Upgrade old flat sidecars to structured format + try: + with open(meta_path, "r") as f: + existing = json.load(f) + if not isinstance(existing.get("dataset"), dict): + files_to_process.append(file_path) + except (json.JSONDecodeError, IOError): + files_to_process.append(file_path) + + if not files_to_process: + return + + # Use tqdm for progress if available + if tqdm is not None: + file_iter = tqdm(files_to_process, desc="Collecting metadata", unit="file") + else: + file_iter = files_to_process + print(f"Collecting metadata for {len(files_to_process)} instances...") + + for file_path in file_iter: + meta_path = self._metadata_path(file_path) + try: + instance_meta = self.collect_instance_metadata(str(file_path)) + except Exception as e: + instance_meta = {"_metadata_error": str(e)} + + # Separate portable from format-specific fields + portable = portable_instance_metadata(instance_meta) + format_specific = { + k: v for k, v in instance_meta.items() + if k not in portable and not k.startswith("_") + } + + # Derive instance name (strip format-specific extensions) + stem = file_path.stem + for ext in (".xml", ".wcnf", ".opb"): + if stem.endswith(ext): + stem = stem[:len(stem) - len(ext)] + break + + # Build structured sidecar + sidecar = { + "dataset": self.dataset_metadata(), + "instance_name": stem, + "source_file": str(file_path.relative_to(self.dataset_dir)), + "category": self.category(), + "instance_metadata": portable, + "format_metadata": format_specific, + } + + if "_metadata_error" in instance_meta: + sidecar["_metadata_error"] = instance_meta["_metadata_error"] + + # Preserve model features from existing sidecar if present + if meta_path.exists(): + try: + with open(meta_path, "r") as f: + existing = 
json.load(f) + if "model_features" in existing: + sidecar["model_features"] = existing["model_features"] + else: + # Upgrade: extract flat model features from old-style sidecar + model_feats = { + k: v for k, v in existing.items() + if k in _MODEL_FEATURE_FIELDS + or k in ("_feature_error", "_domain_feature_error") + } + if model_feats: + sidecar["model_features"] = model_feats + except (json.JSONDecodeError, IOError): + pass + + with open(meta_path, "w") as f: + json.dump(sidecar, f, indent=2) + + def collect_instance_features(self, file) -> dict: + """ + Override in subclass to provide domain-specific instance features + that augment the generic CP features extracted from the model. + + Arguments: + file: path to the instance file + + Returns: + dict with domain-specific feature fields + """ + return {} + + def collect_features(self): + """ + Extract CP model features for all instances using the dataset's reader. + + Parses each instance into a CPMpy model, extracts generic model features + via extract_model_features(), and optionally collects domain-specific + features via collect_instance_features(). + + Results are stored in the ``model_features`` section of ``.meta.json`` + sidecar files (structured format) or as flat fields (legacy format). + """ + if self.reader is None: + raise ValueError( + f"No reader configured for {self.__class__.__name__}. " + f"Set the 'reader' class attribute to enable feature extraction." 
+ ) + + files = self._list_instances() + + # Filter files that need processing + files_to_process = [] + for file_path in files: + meta_path = self._metadata_path(file_path) + existing = {} + if meta_path.exists(): + with open(meta_path, "r") as f: + existing = json.load(f) + # Skip if features already collected + if isinstance(existing.get("dataset"), dict): + # Structured format — check model_features section + if "model_features" in existing: + continue + else: + # Legacy flat format + if "num_variables" in existing: + continue + files_to_process.append(file_path) + + if not files_to_process: + return + + errors = [] + + # Use tqdm for progress if available + if tqdm is not None: + file_iter = tqdm(files_to_process, desc="Collecting features", unit="instance") + else: + file_iter = files_to_process + print(f"Collecting features for {len(files_to_process)} instances...") + + for file_path in file_iter: + meta_path = self._metadata_path(file_path) + + # Load existing sidecar (or empty dict) + existing = {} + if meta_path.exists(): + with open(meta_path, "r") as f: + existing = json.load(f) + + try: + model = self.reader(str(file_path), open=self.open) + features = extract_model_features(model) + except Exception as e: + features = {"_feature_error": str(e)} + errors.append((str(file_path), str(e))) + + # Collect domain-specific features + try: + domain_features = self.collect_instance_features(str(file_path)) + features.update(domain_features) + except Exception as e: + features["_domain_feature_error"] = str(e) + + # Store features in the appropriate location + if isinstance(existing.get("dataset"), dict): + # Structured format: store in model_features section + existing["model_features"] = features + else: + # Legacy flat format + existing.update(features) + + with open(meta_path, "w") as f: + json.dump(existing, f, indent=2) + + if errors: + warnings.warn( + f"Feature extraction failed for {len(errors)}/{len(files_to_process)} instances. 
" + f"First error: {errors[0][1]}" + ) + def open(self, instance) -> io.TextIOBase: """ How an instance file from the dataset should be opened. - Especially usefull when files come compressed and won't work with + Especially usefull when files come compressed and won't work with python standard library's 'open', e.g. '.xz', '.lzma'. """ return open(instance, "r") @@ -93,52 +458,93 @@ def metadata(self, file) -> dict: 'name': pathlib.Path(file).stem.replace(self.extension, ''), 'path': file, } + # Load sidecar metadata if it exists + meta_path = self._metadata_path(file) + if meta_path.exists(): + with open(meta_path, "r") as f: + sidecar = json.load(f) + # Handle structured vs flat sidecar format + if isinstance(sidecar.get("dataset"), dict): + # Structured: flatten instance_metadata and format_metadata + metadata.update(sidecar.get("instance_metadata", {})) + metadata.update(sidecar.get("format_metadata", {})) + metadata.update(sidecar.get("model_features", {})) + else: + # Legacy flat format + metadata.update(sidecar) return metadata - + def __len__(self) -> int: """Return the total number of instances.""" - return len(list(self.dataset_dir.rglob(f"*{self.extension}"))) - + return len(self._list_instances()) def __getitem__(self, index: int) -> Tuple[Any, Any]: - if index < 0 or index >= len(self): raise IndexError("Index out of range") - # Get all compressed XML files and sort for deterministic behavior - files = sorted(list(self.dataset_dir.rglob(f"*{self.extension}"))) + files = self._list_instances() file_path = files[index] filename = str(file_path) - # Basic metadata about the instance metadata = self.metadata(file=filename) if self.target_transform: metadata = self.target_transform(metadata) if self.transform: - # does not need to remain a filename... filename = self.transform(filename) - + # Let transforms contribute to metadata (e.g. 
model verification info) + if hasattr(self.transform, 'enrich_metadata'): + metadata = self.transform.enrich_metadata(filename, metadata) + return filename, metadata @staticmethod - def _download_file(url: str, target: str, destination: Optional[str] = None, - desc: str = None, - chunk_size: int = 1024 * 1024) -> os.PathLike: + def _try_origin(base_url: str, target: str, destination: str, desc: str, chunk_size: int) -> Optional[pathlib.Path]: """ - Download a file from a URL with progress bar and speed information. + Try to download a file from a specific origin URL. + Arguments: + base_url (str): Base URL to try + target (str): Target filename + destination (str): Destination path + desc (str): Description for progress bar + chunk_size (int): Chunk size for download + + Returns: + pathlib.Path if successful, None if failed + """ + try: + full_url = base_url.rstrip('/') + '/' + target.lstrip('/') + req = Request(full_url) + with urlopen(req) as response: + total_size = int(response.headers.get('Content-Length', 0)) + + _Dataset._download_sequential(full_url, destination, total_size, desc, chunk_size) + return pathlib.Path(destination) + except (HTTPError, URLError) as e: + return None + + @staticmethod + def _download_file(url: str, target: str, destination: Optional[str] = None, + desc: str = None, + chunk_size: int = 1024 * 1024, + origins: Optional[List[str]] = None) -> os.PathLike: + """ + Download a file from a URL with progress bar and speed information. + Supports multiple origins with fallback. + This method provides a reusable download function with progress updates similar to pip and uv, showing download progress, speed, and ETA. - + Arguments: - url (str): The URL to download from. + url (str): The original URL to download from (used as fallback). target (str): The target filename to download. destination (str, optional): The destination path to save the file. - desc (str, optional): Description to show in the progress bar. 
+ desc (str, optional): Description to show in the progress bar. If None, uses the filename. chunk_size (int): Size of each chunk for download in bytes (default=1MB). - + origins (List[str], optional): List of alternative URL bases to try first. + Returns: str: The destination path where the downloaded file is saved. """ @@ -146,9 +552,24 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, if desc is None: desc = target + temp_destination = None if destination is None: temp_destination = tempfile.NamedTemporaryFile(delete=False) - + destination = temp_destination.name + else: + # Create parent directory if it doesn't exist and destination has a directory component + dest_dir = os.path.dirname(destination) + if dest_dir: + os.makedirs(dest_dir, exist_ok=True) + + # Try custom origins first if provided + if origins: + for origin_url in origins: + result = _Dataset._try_origin(origin_url, target, destination, desc, chunk_size) + if result is not None: + return result + + # Fall back to original URL try: req = Request(url + target) with urlopen(req) as response: @@ -161,15 +582,90 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, if destination is None: temp_destination.close() - return pathlib.Path(destination if destination is not None else temp_destination.name) + _Dataset._download_sequential(url + target, destination, total_size, desc, chunk_size) + + return pathlib.Path(destination) except (HTTPError, URLError) as e: raise ValueError(f"Failed to download file from {url + target}. Error: {str(e)}") - + + @staticmethod + def _download_parallel(urls_and_targets: List[Tuple[str, str]], base_url: str, + destination_dir: str, desc_prefix: str = "Downloading", + chunk_size: int = 1024 * 1024, + max_workers: Optional[int] = None, + origins: Optional[List[str]] = None) -> List[pathlib.Path]: + """ + Download multiple files in parallel from a base URL. 
+ + Arguments: + urls_and_targets (List[Tuple[str, str]]): List of (url_suffix, target_filename) tuples + base_url (str): Base URL for downloads (used as fallback) + destination_dir (str): Directory to save files + desc_prefix (str): Prefix for progress bar descriptions + chunk_size (int): Chunk size for downloads + max_workers (int, optional): Maximum number of parallel workers. Defaults to min(32, num_files) + origins (List[str], optional): List of alternative URL bases to try first + + Returns: + List[pathlib.Path]: List of downloaded file paths + """ + os.makedirs(destination_dir, exist_ok=True) + + if max_workers is None: + max_workers = min(32, len(urls_and_targets)) + + downloaded_files = [] + errors = [] + + def download_one(url_suffix: str, target: str) -> Tuple[Optional[pathlib.Path], Optional[str]]: + dest_path = os.path.join(destination_dir, target) + desc = f"{desc_prefix} {target}" + + # Try custom origins first + if origins: + for origin_url in origins: + result = _Dataset._try_origin(origin_url, url_suffix + target, dest_path, desc, chunk_size) + if result is not None: + return result, None + + # Fall back to original URL + try: + full_url = base_url.rstrip('/') + '/' + url_suffix.lstrip('/') + target + req = Request(full_url) + with urlopen(req) as response: + total_size = int(response.headers.get('Content-Length', 0)) + + _Dataset._download_sequential(full_url, dest_path, total_size, desc, chunk_size) + return pathlib.Path(dest_path), None + except Exception as e: + return None, str(e) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(download_one, url_suffix, target): (url_suffix, target) + for url_suffix, target in urls_and_targets + } + + for future in as_completed(futures): + url_suffix, target = futures[future] + result, error = future.result() + if result is not None: + downloaded_files.append(result) + else: + errors.append((target, error)) + + if errors: + error_msg = f"Failed to download 
{len(errors)}/{len(urls_and_targets)} files. " + error_msg += f"First error: {errors[0][0]} - {errors[0][1]}" + warnings.warn(error_msg) + + return downloaded_files + @staticmethod def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc: str, chunk_size: int = 1024 * 1024): - """Download file sequentially (fallback method).""" + """Download file sequentially with progress bar.""" import sys import os @@ -184,20 +680,20 @@ def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc with urlopen(req) as response: if tqdm is not None: if total_size > 0: - with tqdm(total=total_size, unit='B', unit_scale=True, - unit_divisor=1024, desc=desc, file=sys.stdout, + with tqdm(total=total_size, unit='B', unit_scale=True, + unit_divisor=1024, desc=desc, file=sys.stdout, miniters=1, dynamic_ncols=True, ascii=False) as pbar: with open(filepath, 'wb') as f: while True: - chunk = response.read(chunk_size) + chunk = response.read(chunk_size) if not chunk: break f.write(chunk) pbar.update(len(chunk)) else: # Unknown size - with tqdm(unit='B', unit_scale=True, unit_divisor=1024, - desc=desc, file=sys.stdout, miniters=1, + with tqdm(unit='B', unit_scale=True, unit_divisor=1024, + desc=desc, file=sys.stdout, miniters=1, dynamic_ncols=True, ascii=False) as pbar: with open(filepath, 'wb') as f: while True: @@ -224,7 +720,3 @@ def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc sys.stdout.flush() sys.stdout.write("\n") sys.stdout.flush() - - - - diff --git a/cpmpy/tools/dataset/jsplib.py b/cpmpy/tools/dataset/jsplib.py index 7ce5b36a1..8e9eb8f8c 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/dataset/jsplib.py @@ -17,21 +17,36 @@ import cpmpy as cp from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.dataset.config import get_origins class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible """ JSP Dataset in a PyTorch compatible format. 
- + More information on JSPLib can be found here: https://github.com/tamy0612/JSPLIB """ name = "jsplib" - + description = "Job Shop Scheduling Problem benchmark library." + url = "https://github.com/tamy0612/JSPLIB" + license = "" + citation = "" + domain = "scheduling" + format = "JSPLib" + origins = [] # Will be populated from config if available + + @staticmethod + def _reader(file_path, open=open): + from cpmpy.tools.io.jsplib import read_jsplib + return read_jsplib(file_path, open=open) + + reader = _reader + def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False): """ - Initialize the PSPLib Dataset. + Initialize the JSPLib Dataset. Arguments: root (str): Root directory containing the jsp instances (if 'download', instances will be downloaded to this location) @@ -39,21 +54,92 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl target_transform (callable, optional): Optional transform to be applied on the file path download (bool): If True, downloads the dataset from the internet and puts it in `root` directory """ - + self.root = pathlib.Path(root) - self.metadata_file = "instances.json" + self._source_metadata_file = "instances.json" + self._source_metadata = None # Loaded lazily during metadata collection dataset_dir = self.root / self.name + # Load origins from config + if not self.origins: + self.origins = get_origins(self.name) + super().__init__( dataset_dir=dataset_dir, - transform=transform, target_transform=target_transform, + transform=transform, target_transform=target_transform, download=download, extension="" ) def category(self) -> dict: - return {} # no categories - + return {} # no categories + + def _list_instances(self): + """List JSPLib instances, excluding metadata and JSON files.""" + return sorted([ + f for f in self.dataset_dir.rglob("*") + if f.is_file() + and not str(f).endswith(self.METADATA_EXTENSION) + and not str(f).endswith(".json") + ]) + + def 
collect_instance_metadata(self, file) -> dict: + """Extract metadata from instances.json and instance file header.""" + # Lazy load the source metadata + if self._source_metadata is None: + source_path = self.dataset_dir / self._source_metadata_file + if source_path.exists(): + with open(source_path, "r") as f: + self._source_metadata = json.load(f) + else: + self._source_metadata = [] + + result = {} + + # Extract description from file header comments + try: + with self.open(file) as f: + desc_lines = [] + for line in f: + if not line.startswith("#"): + break + cleaned = line.strip().strip("#").strip() + # Skip separator lines and "instance " lines + if cleaned and not cleaned.startswith("+++") and not cleaned.startswith("instance "): + desc_lines.append(cleaned) + if desc_lines: + result["instance_description"] = " ".join(desc_lines) + except Exception: + pass + + # Merge data from instances.json + stem = pathlib.Path(file).stem + for entry in self._source_metadata: + if entry.get("name") == stem: + result["jobs"] = entry.get("jobs") + result["machines"] = entry.get("machines") + result["optimum"] = entry.get("optimum") + if "bounds" in entry: + result["bounds"] = entry["bounds"] + elif entry.get("optimum") is not None: + result["bounds"] = { + "upper": entry["optimum"], + "lower": entry["optimum"] + } + break + return result + + def __getitem__(self, index): + """Supports both integer index and string name lookup.""" + if isinstance(index, str): + files = self._list_instances() + for file_path in files: + if file_path.stem == index: + idx = files.index(file_path) + return super().__getitem__(idx) + raise IndexError(f"Instance '{index}' not found in dataset") + return super().__getitem__(index) + def download(self): url = "https://github.com/tamy0612/JSPLIB/archive/refs/heads/" # download full repo... 
@@ -63,10 +149,10 @@ def download(self): print(f"Downloading JSPLib instances from github.com/tamy0612/JSPLIB") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path)) + target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) except ValueError as e: raise ValueError(f"No dataset available on {url}. Error: {str(e)}") - + # Extract files with zipfile.ZipFile(target_download_path, 'r') as zip_ref: self.dataset_dir.mkdir(parents=True, exist_ok=True) @@ -77,62 +163,13 @@ def download(self): filename = pathlib.Path(file_info.filename).name with zip_ref.open(file_info) as source, open(self.dataset_dir / filename, 'wb') as target: target.write(source.read()) - # extract metadata file - with zip_ref.open("JSPLIB-master/instances.json") as source, open(self.dataset_dir / self.metadata_file, 'wb') as target: + # extract source metadata file + with zip_ref.open("JSPLIB-master/instances.json") as source, open(self.dataset_dir / self._source_metadata_file, 'wb') as target: target.write(source.read()) - + # Clean up the zip file target_download_path.unlink() - - def __getitem__(self, index: int|str) -> Tuple[Any, Any]: - """ - Get a single JSPLib instance filename and metadata. - - Args: - index (int or str): Index or name of the instance to retrieve - - Returns: - Tuple[Any, Any]: A tuple containing: - - The filename of the instance - - Metadata dictionary with file name, track, year etc. - """ - if isinstance(index, int) and (index < 0 or index >= len(self)): - raise IndexError("Index out of range") - - # Get all instance files and sort for deterministic behavior # TODO: use natsort instead? 
- files = sorted(list(self.dataset_dir.rglob("*[!.json]"))) # exclude metadata file - if isinstance(index, int): - file_path = files[index] - elif isinstance(index, str): - for file_path in files: - if file_path.stem == index: - break - else: - raise IndexError(f"Instance {index} not found in dataset") - - filename = str(file_path) - if self.transform: - # does not need to remain a filename... - filename = self.transform(filename) - - with open(self.dataset_dir / self.metadata_file, "r") as f: - for entry in json.load(f): - if entry["name"] == file_path.stem: - metadata = entry - if "bounds" not in metadata: - metadata["bounds"] = {"upper": metadata["optimum"], "lower": metadata["optimum"]} - del metadata['path'] - metadata['path'] = str(file_path) - break - else: - metadata = dict() - - if self.target_transform: - metadata = self.target_transform(metadata) - - return filename, metadata - def open(self, instance: os.PathLike) -> callable: return open(instance, "r") diff --git a/cpmpy/tools/dataset/miplib.py b/cpmpy/tools/dataset/miplib.py index f80634e28..67e7cc8f3 100644 --- a/cpmpy/tools/dataset/miplib.py +++ b/cpmpy/tools/dataset/miplib.py @@ -12,6 +12,7 @@ import io from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.dataset.config import get_origins class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible @@ -23,6 +24,60 @@ class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible """ name = "miplib" + description = "Mixed Integer Programming Library benchmark instances." 
+ url = "https://miplib.zib.de/" + license = "" + citation = "" + domain = "mixed integer programming" + format = "MPS" + origins = [] # Will be populated from config if available + + @staticmethod + def _reader(file_path, open=open): + from cpmpy.tools.io.scip import read_scip + return read_scip(file_path, open=open) + + reader = _reader + + def collect_instance_metadata(self, file) -> dict: + """Extract row/column counts from MPS file sections.""" + result = {} + try: + with self.open(file) as f: + section = None + num_rows = 0 + columns = set() + has_objective = False + for line in f: + stripped = line.strip() + if stripped.startswith("NAME"): + section = "NAME" + elif stripped == "ROWS": + section = "ROWS" + elif stripped == "COLUMNS": + section = "COLUMNS" + elif stripped in ("RHS", "RANGES", "BOUNDS", "ENDATA"): + section = stripped + elif section == "ROWS" and stripped: + parts = stripped.split() + if parts[0] == "N": + has_objective = True + else: + num_rows += 1 + elif section == "COLUMNS" and stripped: + parts = stripped.split() + if parts: + columns.add(parts[0]) + elif section in ("RHS", "RANGES", "BOUNDS", "ENDATA"): + pass # skip to avoid parsing entire file + if section == "ENDATA": + break + result["mps_num_rows"] = num_rows + result["mps_num_columns"] = len(columns) + result["mps_has_objective"] = has_objective + except Exception: + pass + return result def __init__( self, @@ -53,6 +108,10 @@ def __init__( dataset_dir = self.root / self.name / str(year) / track + # Load origins from config + if not self.origins: + self.origins = get_origins(self.name) + super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, @@ -74,7 +133,7 @@ def download(self): print(f"Downloading MIPLib instances from miplib.zib.de") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path)) + target_download_path = self._download_file(url, target, destination=str(target_download_path), 
origins=self.origins) except ValueError as e: raise ValueError(f"No dataset available on {url}. Error: {str(e)}") diff --git a/cpmpy/tools/dataset/mse.py b/cpmpy/tools/dataset/mse.py index dd1fcc163..5c2aa8ad6 100644 --- a/cpmpy/tools/dataset/mse.py +++ b/cpmpy/tools/dataset/mse.py @@ -12,6 +12,7 @@ import io from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.dataset.config import get_origins class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible @@ -29,6 +30,20 @@ class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible """ name = "mse" + description = "MaxSAT Evaluation competition benchmark instances." + url = "https://maxsat-evaluations.github.io/" + license = "" + citation = "" + domain = "maximum satisfiability" + format = "WCNF" + origins = [] # Will be populated from config if available + + @staticmethod + def _reader(file_path, open=open): + from cpmpy.tools.io.wcnf import read_wcnf + return read_wcnf(file_path, open=open) + + reader = _reader def __init__( self, @@ -66,6 +81,10 @@ def __init__( dataset_dir = self.root / self.name / str(year) / track + # Load origins from config + if not self.origins: + self.origins = get_origins(self.name) + super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, @@ -77,7 +96,37 @@ def category(self) -> dict: "year": self.year, "track": self.track } - + + def collect_instance_metadata(self, file) -> dict: + """Extract statistics from WCNF header comments. + + WCNF files from MSE contain JSON-like statistics in comment lines: + nvars, ncls, nhards, nsofts, total_lits, nsoft_wts, and length stats. 
+ """ + import re + result = {} + try: + with self.open(file) as f: + for line in f: + line = line.strip() + if not line.startswith("c"): + break + # Extract all numeric fields from JSON-style comments + for key, meta_key in [ + ("nvars", "wcnf_num_variables"), + ("ncls", "wcnf_num_clauses"), + ("nhards", "wcnf_num_hard_clauses"), + ("nsofts", "wcnf_num_soft_clauses"), + ("total_lits", "wcnf_total_literals"), + ("nsoft_wts", "wcnf_num_distinct_weights"), + ]: + match = re.search(rf'"{key}"\s*:\s*(\d+)', line) + if match: + result[meta_key] = int(match.group(1)) + except Exception: + pass + return result + def download(self): url = f"https://www.cs.helsinki.fi/group/coreo/MSE{self.year}-instances/" @@ -87,7 +136,7 @@ def download(self): print(f"Downloading MaxSAT Eval {self.year} {self.track} instances from cs.helsinki.fi") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path)) + target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) except ValueError as e: raise ValueError(f"No dataset available for year {self.year} and track {self.track}. Error: {str(e)}") diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py index 8c23d9a45..5a989f342 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -15,6 +15,7 @@ import cpmpy as cp from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.dataset.config import get_origins # Optional dependencies try: @@ -39,6 +40,20 @@ class NurseRosteringDataset(_Dataset): # torch.utils.data.Dataset compatible """ name = "nurserostering" + description = "Nurse rostering benchmark instances from schedulingbenchmarks.org." 
+ url = "https://schedulingbenchmarks.org/nrp/" + license = "" + citation = "" + domain = "scheduling" + format = "NRP text" + origins = [] # Will be populated from config if available + + @staticmethod + def _reader(file_path, open=open): + from cpmpy.tools.io.nurserostering import read_nurserostering + return read_nurserostering(file_path, open=open) + + reader = _reader def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False, sort_key=None): """ @@ -59,6 +74,10 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl dataset_dir = self.root / self.name + # Load origins from config + if not self.origins: + self.origins = get_origins(self.name) + super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, @@ -66,7 +85,24 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl ) def category(self) -> dict: - return {} # no categories + return {} # no categories + + def collect_instance_metadata(self, file) -> dict: + """Extract scheduling metadata from nurse rostering instance.""" + try: + data = parse_scheduling_period(file) + return { + "horizon": data["horizon"], + "num_staff": len(data["staff"]), + "num_shifts": len(data["shifts"]), + "num_days_off": len(data.get("days_off", [])), + "num_shift_on_requests": len(data.get("shift_on", []) or []), + "num_shift_off_requests": len(data.get("shift_off", []) or []), + "num_cover_requirements": len(data.get("cover", []) or []), + } + except Exception: + pass + return {} def download(self): @@ -77,7 +113,7 @@ def download(self): print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path)) + target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) except ValueError as e: raise ValueError(f"No dataset available on 
{url}. Error: {str(e)}") diff --git a/cpmpy/tools/dataset/opb.py b/cpmpy/tools/dataset/opb.py index 2d0d876ba..429161c7f 100644 --- a/cpmpy/tools/dataset/opb.py +++ b/cpmpy/tools/dataset/opb.py @@ -12,6 +12,7 @@ import io from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.dataset.config import get_origins class OPBDataset(_Dataset): @@ -28,6 +29,20 @@ class OPBDataset(_Dataset): """ name = "opb" + description = "Pseudo-Boolean Competition benchmark instances." + url = "https://www.cril.univ-artois.fr/PB25/" + license = "" + citation = "" + domain = "pseudo-boolean optimization" + format = "OPB" + origins = [] # Will be populated from config if available + + @staticmethod + def _reader(file_path, open=open): + from cpmpy.tools.io.opb import read_opb + return read_opb(file_path, open=open) + + reader = _reader def __init__( self, @@ -68,6 +83,10 @@ def __init__( dataset_dir = self.root / self.name / str(year) / track / ('selected' if self.competition else 'normalized') + # Load origins from config + if not self.origins: + self.origins = get_origins(self.name) + super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, @@ -80,9 +99,38 @@ def category(self) -> dict: "track": self.track } - def metadata(self, file) -> dict: - # Add the author to the metadata - return super().metadata(file) | {'author': str(file).split(os.sep)[-1].split("_")[0],} + def collect_instance_metadata(self, file) -> dict: + """Extract metadata from OPB filename and file header. + + Parses the `* #variable= ... #constraint= ...` header line and + extracts the author from the filename convention (first part before `_`). 
+ """ + import re + result = {} + # Author from filename + filename = pathlib.Path(file).name + parts = filename.split("_") + if len(parts) > 1: + result["author"] = parts[0] + # Parse header for variable/constraint counts + try: + with self.open(file) as f: + for line in f: + line = line.strip() + if not line.startswith("*"): + break + var_match = re.search(r'#variable=\s*(\d+)', line) + con_match = re.search(r'#constraint=\s*(\d+)', line) + if var_match: + result["opb_num_variables"] = int(var_match.group(1)) + if con_match: + result["opb_num_constraints"] = int(con_match.group(1)) + prod_match = re.search(r'#product=\s*(\d+)', line) + if prod_match: + result["opb_num_products"] = int(prod_match.group(1)) + except Exception: + pass + return result def download(self): @@ -93,7 +141,7 @@ def download(self): print(f"Downloading OPB {self.year} {self.track} {'competition' if self.competition else 'non-competition'} instances from www.cril.univ-artois.fr") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path)) + target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) except ValueError as e: raise ValueError(f"No dataset available for year {self.year}. Error: {str(e)}") diff --git a/cpmpy/tools/dataset/psplib.py b/cpmpy/tools/dataset/psplib.py index ac685976a..aaa019d76 100644 --- a/cpmpy/tools/dataset/psplib.py +++ b/cpmpy/tools/dataset/psplib.py @@ -10,6 +10,7 @@ import zipfile from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.dataset.config import get_origins class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible """ @@ -19,6 +20,20 @@ class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible """ name = "psplib" + description = "Project Scheduling Problem Library (RCPSP) benchmark instances." 
+ url = "https://www.om-db.wi.tum.de/psplib/main.html" + license = "" + citation = "" + domain = "scheduling" + format = "PSPLIB SM" + origins = [] # Will be populated from config if available + + @staticmethod + def _reader(file_path, open=open): + from cpmpy.tools.io.rcpsp import read_rcpsp + return read_rcpsp(file_path, open=open) + + reader = _reader def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False): """ @@ -54,6 +69,10 @@ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", dataset_dir = self.root / self.name / self.variant / self.family + # Load origins from config + if not self.origins: + self.origins = get_origins(self.name) + super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, @@ -66,6 +85,62 @@ def category(self) -> dict: "family": self.family } + def collect_instance_metadata(self, file) -> dict: + """Extract project metadata from SM file header.""" + import re + result = {} + try: + with self.open(file) as f: + lines = f.readlines() + + in_project_info = False + in_resource_avail = False + for i, raw_line in enumerate(lines): + line = raw_line.strip() + if line.startswith("jobs"): + match = re.search(r':\s*(\d+)', line) + if match: + result["num_jobs"] = int(match.group(1)) + elif line.startswith("horizon"): + match = re.search(r':\s*(\d+)', line) + if match: + result["horizon"] = int(match.group(1)) + elif line.startswith("- renewable"): + match = re.search(r':\s*(\d+)', line) + if match: + result["num_renewable_resources"] = int(match.group(1)) + elif line.startswith("- nonrenewable"): + match = re.search(r':\s*(\d+)', line) + if match: + result["num_nonrenewable_resources"] = int(match.group(1)) + elif line.startswith("- doubly constrained"): + match = re.search(r':\s*(\d+)', line) + if match: + result["num_doubly_constrained_resources"] = int(match.group(1)) + elif 
line.startswith("PROJECT INFORMATION"): + in_project_info = True + elif in_project_info and not line.startswith("*") and not line.startswith("pronr"): + # Data line: pronr #jobs rel.date duedate tardcost MPM-Time + parts = line.split() + if len(parts) >= 6: + result["duedate"] = int(parts[3]) + result["tardcost"] = int(parts[4]) + result["mpm_time"] = int(parts[5]) + in_project_info = False + elif line.startswith("RESOURCEAVAILABILITIES"): + in_resource_avail = True + elif in_resource_avail and not line.startswith("*") and not line.startswith("R ") and not line.startswith("N "): + # Resource availability values line + parts = line.split() + if parts: + result["resource_availabilities"] = [int(x) for x in parts] + in_resource_avail = False + elif line.startswith("PRECEDENCE RELATIONS") or line.startswith("REQUESTS/DURATIONS"): + in_project_info = False + except Exception: + pass + return result + def download(self): url = "https://www.om-db.wi.tum.de/psplib/files/" @@ -75,7 +150,7 @@ def download(self): print(f"Downloading PSPLib {self.variant} {self.family} instances from www.om-db.wi.tum.de") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path)) + target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) except ValueError as e: raise ValueError(f"No dataset available for variant {self.variant} and family {self.family}. 
Error: {str(e)}") diff --git a/cpmpy/tools/dataset/xcsp3.py b/cpmpy/tools/dataset/xcsp3.py index 2fd989da7..e39dbf726 100644 --- a/cpmpy/tools/dataset/xcsp3.py +++ b/cpmpy/tools/dataset/xcsp3.py @@ -11,6 +11,7 @@ import io from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.dataset.config import get_origins class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible @@ -28,7 +29,46 @@ class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible """ name = "xcsp3" - + description = "XCSP3 competition benchmark instances for constraint satisfaction and optimization." + url = "https://xcsp.org/instances/" + license = "" + citation = "" + domain = "constraint programming" + format = "XCSP3" + origins = [] # Will be populated from config if available + + @staticmethod + def _reader(file_path, open=open): + from cpmpy.tools.xcsp3.parser import read_xcsp3 + return read_xcsp3(file_path, open=open) + + reader = _reader + + def collect_instance_metadata(self, file) -> dict: + """Extract instance type (CSP/COP) from XCSP3 XML root element.""" + import re + result = {} + try: + with self.open(file) as f: + # Read only the first few lines to find the root element + header = "" + for _ in range(10): + line = f.readline() + if not line: + break + header += line + if ">" in line: + break + match = re.search(r'type\s*=\s*"([^"]+)"', header) + if match: + result["instance_type"] = match.group(1) + match = re.search(r'format\s*=\s*"([^"]+)"', header) + if match: + result["xcsp_format"] = match.group(1) + except Exception: + pass + return result + def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False): """ Initialize the XCSP3 Dataset. @@ -45,6 +85,10 @@ def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transf if not track: raise ValueError("Track must be specified, e.g. 
COP, CSP, MiniCOP, ...") + # Load origins from config + if not self.origins: + self.origins = get_origins(self.name) + super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, @@ -66,7 +110,7 @@ def download(self): print(f"Downloading XCSP3 {self.year} instances from www.cril.univ-artois.fr") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path)) + target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) except ValueError as e: raise ValueError(f"No dataset available for year {self.year}. Error: {str(e)}") From 95198ff11a77d20b56b5264b2a6c707852473b1e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 12 Feb 2026 17:12:34 +0100 Subject: [PATCH 101/152] experimental download origin --- cpmpy/tools/dataset/config.py | 74 +++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 cpmpy/tools/dataset/config.py diff --git a/cpmpy/tools/dataset/config.py b/cpmpy/tools/dataset/config.py new file mode 100644 index 000000000..f2fc0e14c --- /dev/null +++ b/cpmpy/tools/dataset/config.py @@ -0,0 +1,74 @@ +""" +Configuration for CPMpy dataset download origins. + +This module provides configuration for custom download origins that can be used +as alternatives to the original dataset sources. Origins are tried in order, +falling back to the original source if all custom origins fail. + +Configuration can be set via: +1. Environment variables (CPMPY_DATASET_ORIGINS_{DATASET_NAME}) +2. This config file +3. 
Class attributes in dataset classes +""" + +import os +from typing import Dict, List, Optional + +# Default origins configuration +# Format: {dataset_name: [list of URL bases]} +_DEFAULT_ORIGINS: Dict[str, List[str]] = { + # Example: + # "xcsp3": ["https://cpmpy-datasets.example.com/xcsp3"], + # "mse": ["https://cpmpy-datasets.example.com/mse"], +} + +def get_origins(dataset_name: str) -> List[str]: + """ + Get custom origins for a dataset. + + Checks in order: + 1. Environment variable CPMPY_DATASET_ORIGINS_{DATASET_NAME} + 2. _DEFAULT_ORIGINS dictionary + 3. Returns empty list (no custom origins) + + Arguments: + dataset_name (str): Name of the dataset (e.g., "xcsp3", "mse") + + Returns: + List[str]: List of origin URL bases to try + """ + # Check environment variable first + env_var = f"CPMPY_DATASET_ORIGINS_{dataset_name.upper()}" + env_value = os.getenv(env_var) + if env_value: + # Split by comma and strip whitespace + return [url.strip() for url in env_value.split(",") if url.strip()] + + # Check default origins + return _DEFAULT_ORIGINS.get(dataset_name, []) + +def set_default_origin(dataset_name: str, origin_url: str): + """ + Set a default origin URL for a dataset (for programmatic configuration). + + Arguments: + dataset_name (str): Name of the dataset + origin_url (str): Base URL for the origin + """ + if dataset_name not in _DEFAULT_ORIGINS: + _DEFAULT_ORIGINS[dataset_name] = [] + if origin_url not in _DEFAULT_ORIGINS[dataset_name]: + _DEFAULT_ORIGINS[dataset_name].append(origin_url) + +def set_default_origins(dataset_name: str, origin_urls: List[str]): + """ + Set multiple default origin URLs for a dataset (for programmatic configuration). 
+ + Arguments: + dataset_name (str): Name of the dataset + origin_urls (List[str]): List of base URLs for origins + """ + _DEFAULT_ORIGINS[dataset_name] = origin_urls.copy() + + + From 41ee1119f8f81a8ad0771c29618e41437e6e0bc3 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 12 Feb 2026 17:13:09 +0100 Subject: [PATCH 102/152] imports --- cpmpy/tools/dataset/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cpmpy/tools/dataset/__init__.py b/cpmpy/tools/dataset/__init__.py index 65fb041b8..87980e7ce 100644 --- a/cpmpy/tools/dataset/__init__.py +++ b/cpmpy/tools/dataset/__init__.py @@ -1,3 +1,4 @@ +from ._base import extract_model_features, portable_instance_metadata from .miplib import MIPLibDataset from .jsplib import JSPLibDataset from .psplib import PSPLibDataset From f4771593e32945e08546543781c0543dd2538b34 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 12 Feb 2026 17:13:32 +0100 Subject: [PATCH 103/152] dataset cli --- cpmpy/cli.py | 152 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 148 insertions(+), 4 deletions(-) diff --git a/cpmpy/cli.py b/cpmpy/cli.py index 2ac220b0d..2daaab955 100644 --- a/cpmpy/cli.py +++ b/cpmpy/cli.py @@ -1,14 +1,14 @@ """ Command-line interface for CPMpy. -This module provides a simple CLI to interact with CPMpy, primarily to display -version information about CPMpy itself and the available solver backends. - Usage: cpmpy Commands: - version Show the CPMpy library version and the versions of installed solver backends. 
+ version Show CPMpy version and solver backends + dataset list List available datasets + dataset info Show dataset details + dataset download [options] Download a dataset """ import argparse @@ -16,10 +16,131 @@ import cpmpy as cp +# ── Dataset class registry ─────────────────────────────────────── +# Maps CLI name -> (class, {param_name: default_value}) + +DATASET_CLASSES = { + "xcsp3": ("XCSP3Dataset", {"year": 2024, "track": "CSP"}), + "mse": ("MSEDataset", {"year": 2024, "track": "exact-unweighted"}), + "opb": ("OPBDataset", {"year": 2024, "track": "OPT-LIN"}), + "miplib": ("MIPLibDataset", {"year": 2024, "track": "exact-unweighted"}), + "psplib": ("PSPLibDataset", {"variant": "rcpsp", "family": "j30"}), + "nurserostering": ("NurseRosteringDataset", {}), + "jsplib": ("JSPLibDataset", {}), +} + + +def _import_dataset_class(class_name): + """Lazily import a dataset class from cpmpy.tools.dataset.""" + import cpmpy.tools.dataset as ds + return getattr(ds, class_name) + + +# ── Commands ───────────────────────────────────────────────────── + def command_version(args): print(f"CPMpy version: {__version__}") cp.SolverLookup().print_version() + +def command_dataset_list(args): + print("Available datasets:\n") + for name, (cls_name, params) in DATASET_CLASSES.items(): + try: + cls = _import_dataset_class(cls_name) + desc = getattr(cls, "description", "") + except Exception: + desc = "" + line = f" {name:<20s}" + if desc: + # Truncate long descriptions + short = desc if len(desc) <= 60 else desc[:57] + "..." 
+ line += f" {short}" + print(line) + print(f"\nUse 'cpmpy dataset info ' for details.") + + +def command_dataset_info(args): + name = args.name.lower() + if name not in DATASET_CLASSES: + print(f"Unknown dataset: {args.name}") + print(f"Available: {', '.join(DATASET_CLASSES)}") + return + + cls_name, params = DATASET_CLASSES[name] + try: + cls = _import_dataset_class(cls_name) + meta = cls.dataset_metadata() + except Exception as e: + print(f"Error loading dataset class: {e}") + return + + print(f"\n {meta.get('name', name).upper()}") + print(f" {'─' * 40}") + if meta.get("description"): + print(f" {meta['description']}") + print() + for key in ("domain", "format", "url", "license"): + val = meta.get(key) + if val: + print(f" {key:<12s} {val}") + + if params: + print(f"\n Parameters:") + for p, default in params.items(): + print(f" --{p:<14s} (default: {default})") + + # Show example usage + print(f"\n Example:") + arg_parts = [] + for p, default in params.items(): + arg_parts.append(f"--{p} {default}") + extra = (" " + " ".join(arg_parts)) if arg_parts else "" + print(f" cpmpy dataset download {name}{extra}") + print() + + +def command_dataset_download(args): + name = args.name.lower() + if name not in DATASET_CLASSES: + print(f"Unknown dataset: {args.name}") + print(f"Available: {', '.join(DATASET_CLASSES)}") + return + + cls_name, param_defaults = DATASET_CLASSES[name] + + # Build constructor kwargs from CLI args + kwargs = {"root": args.root, "download": True} + + for param, default in param_defaults.items(): + cli_val = getattr(args, param, None) + if cli_val is not None: + # Cast to int if the default is int + if isinstance(default, int): + try: + cli_val = int(cli_val) + except ValueError: + pass + kwargs[param] = cli_val + else: + kwargs[param] = default + + cls = _import_dataset_class(cls_name) + print(f"Downloading {name} dataset...") + for param, default in param_defaults.items(): + print(f" {param}: {kwargs.get(param, default)}") + print(f" root: 
{args.root}") + print() + + try: + dataset = cls(**kwargs) + print(f"\nDone! {len(dataset)} instances downloaded to {args.root}/") + except Exception as e: + print(f"\nError: {e}") + + +# ── Main ───────────────────────────────────────────────────────── + def main(): parser = argparse.ArgumentParser(description="CPMpy command line interface") subparsers = parser.add_subparsers(dest="command", required=True) @@ -28,5 +149,28 @@ def main(): version_parser = subparsers.add_parser("version", help="Show version information on CPMpy and its solver backends") version_parser.set_defaults(func=command_version) + # cpmpy dataset ... + dataset_parser = subparsers.add_parser("dataset", help="Browse and download benchmark datasets") + dataset_sub = dataset_parser.add_subparsers(dest="dataset_command", required=True) + + # cpmpy dataset list + list_parser = dataset_sub.add_parser("list", help="List available datasets") + list_parser.set_defaults(func=command_dataset_list) + + # cpmpy dataset info + info_parser = dataset_sub.add_parser("info", help="Show dataset details") + info_parser.add_argument("name", help="Dataset name") + info_parser.set_defaults(func=command_dataset_info) + + # cpmpy dataset download [options] + dl_parser = dataset_sub.add_parser("download", help="Download a dataset") + dl_parser.add_argument("name", help="Dataset name") + dl_parser.add_argument("--root", default="./data", help="Download directory (default: ./data)") + dl_parser.add_argument("--year", default=None, help="Year/edition") + dl_parser.add_argument("--track", default=None, help="Track/category") + dl_parser.add_argument("--variant", default=None, help="Variant (e.g. for psplib)") + dl_parser.add_argument("--family", default=None, help="Family (e.g. 
for psplib)") + dl_parser.set_defaults(func=command_dataset_download) + args = parser.parse_args() args.func(args) From 31b0aeeb230a288845a6da173de0315f1b587d2a Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 12 Feb 2026 17:15:17 +0100 Subject: [PATCH 104/152] remove duplicate writer --- cpmpy/tools/opb/writer.py | 239 -------------------------------------- 1 file changed, 239 deletions(-) delete mode 100644 cpmpy/tools/opb/writer.py diff --git a/cpmpy/tools/opb/writer.py b/cpmpy/tools/opb/writer.py deleted file mode 100644 index 4b739da5f..000000000 --- a/cpmpy/tools/opb/writer.py +++ /dev/null @@ -1,239 +0,0 @@ -""" - This file implements helper functions for exporting CPMpy models from and to OPB format. - OPB is a textual format to represent Pseudo-Boolean problems. - The header of the file is formatted as ``* #variable= #constraint= ``. - If the number of variables and constraints are not given, it is inferred by the writer. - - Each remaining line of the file is formatted as a constraint. - A constraint is formatted as a string of integers. - An integer represents a Boolean variable and a negative Boolean variable is represented using a `'-'` sign. 
-""" - -import cpmpy as cp - -from cpmpy.transformations.normalize import toplevel_list,simplify_boolean -from cpmpy.transformations.safening import no_partial_functions, safen_objective -from cpmpy.transformations.decompose_global import decompose_in_tree, decompose_objective -from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective -from cpmpy.transformations.reification import only_implies, only_bv_reifies -from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv_wsum -from cpmpy.transformations.int2bool import int2bool, _encode_lin_expr -from cpmpy.transformations.get_variables import get_variables -from cpmpy.expressions.variables import _IntVarImpl, NegBoolView, _BoolVarImpl -from cpmpy.expressions.core import Operator, Comparison - -from cpmpy import __version__ - -def write_opb(model, fname=None, encoding="auto"): - """ - Export a CPMpy model to the OPB (Pseudo-Boolean) format. - - This function transforms the given CPMpy model into OPB format, which is a standard textual - format for representing Pseudo-Boolean optimization problems. The OPB file will contain - a header specifying the number of variables and constraints, the objective (optional), and the - list of constraints using integer-weighted Boolean variables. - - Args: - model (cp.Model): The CPMpy model to export. - fname (str, optional): The file name to write the OPB output to. If None, the OPB string is returned. - encoding (str, optional): The encoding used for `int2bool`. Options: ("auto", "direct", "order", "binary"). - - Returns: - str or None: The OPB string if `fname` is None, otherwise nothing (writes to file). - - Format: - * #variable= #constraint= - * OPB file generated by CPMpy version - min/max: ; - ; - ; - ... - - Note: - Some solvers only support variable names of the form x. The OPB writer will remap - all CPMpy variables to such a format internally. 
- - Example: - >>> from cpmpy import * - >>> x = boolvar(shape=3) - >>> m = Model(x[0] + x[1] + x[2] >= 2) - >>> print(write_opb(m)) - """ - - csemap, ivarmap = dict(), dict() - opb_cons = _transform(model.constraints, csemap, ivarmap, encoding) - - if model.objective_ is not None: - opb_obj, const, extra_cons = _transform_objective(model.objective_, csemap, ivarmap, encoding) - opb_cons += extra_cons - else: - opb_obj = None - - # Form header and variable mapping - # Use all variables occurring in constraints and the objective - all_vars = get_variables(opb_cons + ([opb_obj] if opb_obj is not None else [])) - out = [ - f"* #variable= {len(all_vars)} #constraint= {len(opb_cons)}", - f"* OPB file generated by CPMpy version {__version__}", - ] - # Remap variables to 'x1', 'x2', ..., the standard OPB way - varmap = {v: f"x{i+1}" for i, v in enumerate(all_vars)} - - # Write objective, if present - if model.objective_ is not None: - objective_str = _wsum_to_str(opb_obj, varmap) - out.append(f"{'min' if model.objective_is_min else 'max'}: {objective_str};") - - # Write constraints - for cons in opb_cons: - assert isinstance(cons, Comparison), f"Expected a comparison, but got {cons}" - lhs, rhs = cons.args - constraint_str = f"{_wsum_to_str(lhs, varmap)} {cons.name} {rhs};" - out.append(constraint_str) - - # Output to file or string - contents = "\n".join(out) - if fname is None: - return contents - else: - with open(fname, "w") as f: - f.write(contents) - -def _normalized_comparison(lst_of_expr): - """ - Convert a list of linear CPMpy expressions into OPB-compatible pseudo-Boolean constraints. - - Transforms a list of Boolean-linear CPMpy expressions (as output by `linearize_constraint`) into a list - of OPB-normalized constraints, expressed as comparisons between weighted Boolean sums - (using "wsum") and integer constants. Handles Boolean vars, reifications, implications, - and ensures all equalities are decomposed into two inequalities. 
- - Args: - lst_of_expr (list): List of CPMpy Boolean-linear expressions. - - Returns: - list: List of normalized CPMpy `Comparison` objects representing pseudo-Boolean constraints. - """ - newlist = [] - for cpm_expr in lst_of_expr: - if isinstance(cpm_expr, cp.BoolVal) and cpm_expr.value() is False: - raise NotImplementedError(f"Cannot transform {cpm_expr} to OPB constraint") - - # single Boolean variable - if isinstance(cpm_expr, _BoolVarImpl): - cpm_expr = Operator("sum", [cpm_expr]) >= 1 - - # implication - if isinstance(cpm_expr, Operator) and cpm_expr.name == "->": - bv, subexpr = cpm_expr.args - assert isinstance(subexpr, _BoolVarImpl), "Only bv -> bv should reach here, but got {subexpr}" - cpm_expr = Operator("wsum", [[-1, 1], [bv, subexpr]]) >= 0 - newlist.append(cpm_expr) - continue - - # Comparison, can be single Boolean variable or (weighted) sum of Boolean variables - if isinstance(cpm_expr, Comparison): - lhs, rhs = cpm_expr.args - - if isinstance(lhs, _BoolVarImpl): - lhs = Operator("sum", [lhs]) - if lhs.name == "sum": - lhs = Operator("wsum", [[1]*len(lhs.args), lhs.args]) - - assert isinstance(lhs, Operator) and lhs.name == "wsum", f"Expected a wsum, but got {lhs}" - - # convert comparisons into >= constraints - if cpm_expr.name == "==": - newlist += _normalized_comparison([lhs <= rhs]) - newlist += _normalized_comparison([lhs >= rhs]) - elif cpm_expr.name == ">=": - newlist.append(lhs >= rhs) - elif cpm_expr.name == "<=": - new_weights = [-w for w in lhs.args[0]] - newlist.append(Operator("wsum", [new_weights, lhs.args[1]]) >= -rhs) - else: - raise ValueError(f"Unknown comparison {cpm_expr.name}") - else: - raise NotImplementedError(f"Expected a comparison, but got {cpm_expr}") - - return newlist - -def _wsum_to_str(cpm_expr, varmap): - """ - Convert a weighted sum CPMpy expression to a string in OPB format. 
- - args: - cpm_expr (Operator): wsum CPMpy expression - varmap (dict): dictionary mapping CPMpy variables to OPB variable names - """ - assert isinstance(cpm_expr, Operator) and cpm_expr.name == "wsum", f"Expected a wsum, but got {cpm_expr}" - weights, args = cpm_expr.args - - out = [] - for w, var in zip(weights, args): - var = varmap[var] if not isinstance(var, NegBoolView) else f"~{varmap[var._bv]}" - if w < 0: - out.append(f"- {w} {var}") - elif w > 0: - out.append(f"+ {w} {var}") - else: - pass # zero weight, ignore - - str_out = " ".join(out) - return str_out - -def _transform(cpm_expr, csemap, ivarmap, encoding="auto"): - """ - Transform a list of CPMpy expressions into a list of Pseudo-Boolean constraints. - """ - - cpm_cons = toplevel_list(cpm_expr) - cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"div", "mod", "element"}) - cpm_cons = decompose_in_tree(cpm_cons, - supported={"alldifferent"}, # alldiff has a specialized MIP decomp in linearize - csemap=csemap - ) - cpm_cons = simplify_boolean(cpm_cons) - cpm_cons = flatten_constraint(cpm_cons, csemap=csemap) # flat normal form - cpm_cons = only_bv_reifies(cpm_cons, csemap=csemap) - cpm_cons = only_implies(cpm_cons, csemap=csemap) - cpm_cons = linearize_constraint( - cpm_cons, supported=frozenset({"sum", "wsum"}), csemap=csemap - ) - cpm_cons = int2bool(cpm_cons, ivarmap, encoding=encoding) - - return _normalized_comparison(cpm_cons) - -def _transform_objective(expr, csemap, ivarmap, encoding="auto"): - """ - Transform a CPMpy objective expression into a weighted sum expression - """ - - # transform objective - obj, safe_cons = safen_objective(expr) - obj, decomp_cons = decompose_objective(obj, supported={"alldifferent"}, - csemap=csemap) - obj, flat_cons = flatten_objective(obj, csemap=csemap) - obj = only_positive_bv_wsum(obj) # remove negboolviews - - weights, xs, const = [], [], 0 - # we assume obj is a var, a sum or a wsum (over int and bool vars) - if isinstance(obj, _IntVarImpl) or 
isinstance(obj, NegBoolView): # includes _BoolVarImpl - weights = [1] - xs = [obj] - elif obj.name == "sum": - xs = obj.args - weights = [1] * len(xs) - elif obj.name == "wsum": - weights, xs = obj.args - else: - raise NotImplementedError(f"OPB: Non supported objective {obj} (yet?)") - - terms, cons, k = _encode_lin_expr(ivarmap, xs, weights, encoding) - - # remove terms with coefficient 0 (`only_positive_coefficients_` may return them and RC2 does not accept them) - terms = [(w, x) for w,x in terms if w != 0] - - obj = Operator("wsum", [[w for w,x in terms], [x for w,x in terms]]) - return obj, const, safe_cons + decomp_cons + flat_cons - From 4bedebda570976775e7bb2f055143e38251c9f98 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 12 Feb 2026 17:24:25 +0100 Subject: [PATCH 105/152] Dataset transform helpers --- cpmpy/tools/dataset/__init__.py | 1 + cpmpy/tools/dataset/transforms.py | 525 ++++++++++++++++++++++++++++++ 2 files changed, 526 insertions(+) create mode 100644 cpmpy/tools/dataset/transforms.py diff --git a/cpmpy/tools/dataset/__init__.py b/cpmpy/tools/dataset/__init__.py index 87980e7ce..26099b64a 100644 --- a/cpmpy/tools/dataset/__init__.py +++ b/cpmpy/tools/dataset/__init__.py @@ -6,3 +6,4 @@ from .xcsp3 import XCSP3Dataset from .opb import OPBDataset from .mse import MSEDataset +from .transforms import Compose, Open, Parse, Serialize, Translate, SaveToFile, Lambda, extract_format_metadata diff --git a/cpmpy/tools/dataset/transforms.py b/cpmpy/tools/dataset/transforms.py new file mode 100644 index 000000000..b7205f5dc --- /dev/null +++ b/cpmpy/tools/dataset/transforms.py @@ -0,0 +1,525 @@ +""" +Composable Transforms for CPMpy Datasets + +Provides composable transform classes inspired by torchvision.transforms. +Transforms can be chained using :class:`Compose` and passed as the +``transform`` or ``target_transform`` argument to any Dataset subclass. + +================= +List of classes +================= + +.. 
autosummary:: + :nosignatures: + + Compose + Open + Parse + Serialize + Translate + SaveToFile + Lambda + +Example usage:: + + from cpmpy.tools.dataset import MSEDataset, Compose, Parse, Serialize + from cpmpy.tools.io.wcnf import read_wcnf + + dataset = MSEDataset(root=".", year=2024, track="exact-weighted") + + # Chain: parse WCNF files, then serialize to DIMACS + transform = Compose([ + Parse(read_wcnf, open=dataset.open), + Serialize("dimacs"), + ]) + dataset.transform = transform + + for dimacs_string, metadata in dataset: + print(dimacs_string[:100]) +""" + +import json +import os +import re + +_builtins_open = open # capture before any parameter shadowing + + +def extract_format_metadata(content, format_name): + """Extract format-specific metadata from a translated file content string. + + Parses format headers to extract statistics like variable/constraint counts. + + Arguments: + content (str): The file content string. + format_name (str): The format name (e.g., ``"opb"``, ``"dimacs"``, ``"mps"``). + + Returns: + dict with format-prefixed metadata fields. 
+ """ + result = {} + + if format_name == "opb": + for line in content.split('\n'): + if not line.startswith('*'): + break + match = re.search(r'#variable=\s*(\d+)', line) + if match: + result["opb_num_variables"] = int(match.group(1)) + match = re.search(r'#constraint=\s*(\d+)', line) + if match: + result["opb_num_constraints"] = int(match.group(1)) + match = re.search(r'#product=\s*(\d+)', line) + if match: + result["opb_num_products"] = int(match.group(1)) + + elif format_name == "dimacs": + match = re.search(r'^p\s+(?:w?cnf)\s+(\d+)\s+(\d+)', content, re.MULTILINE) + if match: + result["dimacs_num_variables"] = int(match.group(1)) + result["dimacs_num_clauses"] = int(match.group(2)) + + elif format_name == "mps": + section = None + num_rows = 0 + columns = set() + for line in content.split('\n'): + stripped = line.strip() + if stripped.startswith("NAME"): + section = "NAME" + elif stripped == "ROWS": + section = "ROWS" + elif stripped == "COLUMNS": + section = "COLUMNS" + elif stripped in ("RHS", "RANGES", "BOUNDS", "ENDATA"): + section = stripped + elif section == "ROWS" and stripped: + parts = stripped.split() + if parts[0] != "N": + num_rows += 1 + elif section == "COLUMNS" and stripped: + parts = stripped.split() + if parts: + columns.add(parts[0]) + if section == "ENDATA": + break + if num_rows or columns: + result["mps_num_rows"] = num_rows + result["mps_num_columns"] = len(columns) + + elif format_name == "lp": + # Count constraints in the "Subject To" section + in_subject_to = False + num_constraints = 0 + for line in content.split('\n'): + stripped = line.strip().lower() + if stripped in ("subject to", "st", "s.t."): + in_subject_to = True + elif stripped in ("bounds", "binary", "generals", "end"): + in_subject_to = False + elif in_subject_to and stripped and ":" in stripped: + num_constraints += 1 + if num_constraints: + result["lp_num_constraints"] = num_constraints + + return result + + +def _enrich_from_model(model, metadata): + """Add decision 
variable and objective info from a CPMpy Model to metadata. + + This is called by transforms that produce CPMpy models (Parse, Translate) + via their ``enrich_metadata`` method. It adds: + + - ``decision_variables``: list of dicts with name, type, lb, ub for each variable + - ``objective``: string representation of the objective expression (if any) + - ``objective_is_min``: True if minimizing, False if maximizing (if any) + """ + if not hasattr(model, 'constraints'): + return metadata # not a CPMpy Model + + from cpmpy.transformations.get_variables import get_variables_model + from cpmpy.expressions.variables import _BoolVarImpl + + variables = get_variables_model(model) + metadata['decision_variables'] = [ + { + "name": v.name, + "type": "bool" if isinstance(v, _BoolVarImpl) else "int", + "lb": int(v.lb), + "ub": int(v.ub), + } + for v in variables + ] + + if model.objective_ is not None: + metadata['objective'] = str(model.objective_) + metadata['objective_is_min'] = bool(model.objective_is_min) + + return metadata + + +class Compose: + """ + Composes several transforms together, applying them sequentially. + + Each transform in the sequence receives the output of the previous one. + Transforms that define ``enrich_metadata(data, metadata)`` can contribute + additional fields to the metadata dictionary. Each sub-transform's + ``enrich_metadata`` receives the intermediate result *it* produced, so a + :class:`Parse` inside ``Compose([Parse(...), Serialize(...)])`` sees the + CPMpy model, not the final serialized string. + + Arguments: + transforms (list[callable]): List of transforms to compose. + + Example:: + + >>> transform = Compose([ + ... Parse(read_wcnf, open=dataset.open), + ... Serialize("dimacs"), + ... 
class Compose:
    """
    Chain several transforms, applying them sequentially (left to right).

    Each transform receives the output of the previous one. Sub-transforms
    that define ``enrich_metadata(data, metadata)`` can contribute extra
    metadata fields; each one is handed the intermediate result *it*
    produced during the last ``__call__``, so a :class:`Parse` inside
    ``Compose([Parse(...), Serialize(...)])`` sees the CPMpy model, not the
    final serialized string.

    Arguments:
        transforms (list[callable]): List of transforms to compose.
    """

    def __init__(self, transforms):
        if not isinstance(transforms, (list, tuple)):
            raise TypeError("transforms must be a list or tuple of callables")
        self.transforms = list(transforms)
        # (transform, output-it-produced) pairs recorded by the last __call__
        self._steps = []

    def __call__(self, x):
        self._steps = []
        current = x
        for transform in self.transforms:
            current = transform(current)
            self._steps.append((transform, current))
        return current

    def enrich_metadata(self, data, metadata):
        """Delegate to each sub-transform's enrich_metadata with its own output."""
        for transform, produced in self._steps:
            enrich = getattr(transform, 'enrich_metadata', None)
            if enrich is not None:
                metadata = enrich(produced, metadata)
        return metadata

    def __repr__(self):
        body = "".join(f"    {t},\n" for t in self.transforms)
        return f"{self.__class__.__name__}([\n{body}])"
class Parse:
    """
    Transform that parses a file path into a CPMpy model using a reader function.

    Implements ``enrich_metadata`` to add model verification information
    (decision variables, objective) to the metadata dictionary; the dataset's
    ``__getitem__`` calls it automatically.

    Arguments:
        reader (callable): A reader function such as ``read_wcnf``, ``read_opb``,
            ``read_scip``, ``read_dimacs``, etc.
        open (callable, optional): Passed to the reader as the ``open`` keyword
            argument. If None, the reader uses its own default.
        **kwargs: Additional keyword arguments forwarded to the reader.
    """

    def __init__(self, reader, open=None, **kwargs):
        self.reader = reader
        self._open = open
        self.kwargs = kwargs

    def __call__(self, file_path):
        if self._open is None:
            return self.reader(file_path, **self.kwargs)
        return self.reader(file_path, open=self._open, **self.kwargs)

    def enrich_metadata(self, data, metadata):
        """Add model verification info if data is a CPMpy Model."""
        return _enrich_from_model(data, metadata)

    def __repr__(self):
        label = getattr(self.reader, '__name__', repr(self.reader))
        return f"{self.__class__.__name__}(reader={label})"
class Serialize:
    """
    Transform that serializes a CPMpy model to a string in a given format.

    Arguments:
        format (str): Output format name (e.g., ``"dimacs"``, ``"mps"``, ``"opb"``).
            Must be a format supported by :func:`cpmpy.tools.io.writer.write`.
        **kwargs: Additional keyword arguments forwarded to the writer
            (e.g., ``header``, ``verbose``).
    """

    def __init__(self, format, **kwargs):
        self.format = format
        self.kwargs = kwargs

    def __call__(self, model):
        # Imported lazily so constructing the transform stays cheap
        from cpmpy.tools.io.writer import write
        return write(model, file_path=None, format=self.format, **self.kwargs)

    def __repr__(self):
        return f"{self.__class__.__name__}(format='{self.format}')"
class Translate:
    """
    Transform that translates an instance file from one format to another,
    combining reading (parsing) and writing (serialization) in a single step.

    Implements ``enrich_metadata`` to add model verification information
    from the intermediate CPMpy model to the metadata dictionary.

    Arguments:
        reader (callable): A reader function (e.g., ``read_wcnf``, ``read_opb``).
        format (str): Output format name (e.g., ``"dimacs"``, ``"mps"``).
        open (callable, optional): Callable used to open (possibly compressed)
            files, forwarded to the reader.
        **kwargs: Additional keyword arguments forwarded to the writer.
    """

    def __init__(self, reader, format, open=None, **kwargs):
        self.reader = reader
        self.format = format
        self._open = open
        self.kwargs = kwargs
        self._last_model = None  # kept so enrich_metadata can inspect the model

    def __call__(self, file_path):
        from cpmpy.tools.io.writer import write

        if self._open is None:
            model = self.reader(file_path)
        else:
            model = self.reader(file_path, open=self._open)

        self._last_model = model
        return write(model, format=self.format, file_path=None, **self.kwargs)

    def enrich_metadata(self, data, metadata):
        """Add model verification info from the intermediate model."""
        model = self._last_model
        return metadata if model is None else _enrich_from_model(model, metadata)

    def __repr__(self):
        label = getattr(self.reader, '__name__', repr(self.reader))
        return f"{self.__class__.__name__}(reader={label}, format='{self.format}')"
class SaveToFile:
    """
    Transform that writes its input string to a file and returns the file path.

    When ``write_metadata=True``, also writes a ``.meta.json`` sidecar file
    alongside each output file. The sidecar contains portable instance
    metadata from the dataset (filtered by
    :func:`~cpmpy.tools.dataset._base.portable_instance_metadata`) and
    format-specific metadata extracted from the written content.

    Arguments:
        output_dir (str): Directory to write files to (created if needed).
        extension (str): File extension for output files (e.g., ``".cnf"``, ``".mps"``).
        naming (callable, optional): Function that receives the current data
            and returns a filename stem. If None, uses a counter.
        write_metadata (bool): If True, writes a ``.meta.json`` sidecar file
            next to each saved file. Requires being used inside a
            :class:`Compose` with the dataset's ``__getitem__``.
        target_format (str, optional): Target format name for format-specific
            metadata extraction. If None, inferred from the extension.
    """

    def __init__(self, output_dir, extension="", naming=None,
                 write_metadata=False, target_format=None):
        self.output_dir = output_dir
        self.extension = extension
        self.naming = naming
        self.write_metadata = write_metadata
        self.target_format = target_format
        self._counter = 0        # sequential suffix for default file names
        self._last_path = None   # path written by the most recent __call__
        self._last_content = None

    def __call__(self, content):
        """Write ``content`` to a new file in ``output_dir`` and return its path."""
        os.makedirs(self.output_dir, exist_ok=True)

        if self.naming is not None:
            name = self.naming(content)
        else:
            name = f"instance_{self._counter}"
            self._counter += 1

        file_path = os.path.join(self.output_dir, name + self.extension)
        # Explicit encoding: instance files must not depend on the platform default
        with _builtins_open(file_path, "w", encoding="utf-8") as f:
            f.write(content)
        self._last_path = file_path
        self._last_content = content
        return file_path

    def enrich_metadata(self, data, metadata):
        """Write a metadata sidecar alongside the saved file if enabled.

        The sidecar mirrors the structure used by ``translate_datasets.py``:
        ``dataset``, ``instance_name``, ``category``, ``instance_metadata``,
        ``translation``, and ``format_metadata`` sections.
        """
        if not self.write_metadata or self._last_path is None:
            return metadata

        from cpmpy.tools.dataset._base import portable_instance_metadata

        sidecar = {}

        # Dataset-level metadata (if present in the metadata dict)
        if "dataset" in metadata:
            # When called from __getitem__, metadata has 'dataset' as a string name.
            sidecar["dataset"] = {"name": metadata.get("dataset", "")}

        # Instance identification
        sidecar["instance_name"] = metadata.get("name", "")
        if "path" in metadata:
            sidecar["source_file"] = metadata["path"]

        # Category labels provided by the dataset (year, track, variant, family)
        category_keys = {
            k: v for k, v in metadata.items()
            if k in ("year", "track", "variant", "family")
        }
        if category_keys:
            sidecar["category"] = category_keys

        # Portable instance metadata
        sidecar["instance_metadata"] = portable_instance_metadata(metadata)

        # Translation info
        fmt = self.target_format or self._infer_format()
        import cpmpy
        from cpmpy.tools.io.writer import writer_dependencies
        translation = {
            "target_format": fmt or "",
            "cpmpy_version": cpmpy.__version__,
        }
        if fmt:
            deps = writer_dependencies(fmt)
            if deps:
                translation["writer_dependencies"] = deps
        sidecar["translation"] = translation

        # Format-specific metadata extracted from the written content
        if fmt and self._last_content:
            sidecar["format_metadata"] = extract_format_metadata(
                self._last_content, fmt
            )

        sidecar_path = self._last_path + ".meta.json"
        with _builtins_open(sidecar_path, "w", encoding="utf-8") as f:
            json.dump(sidecar, f, indent=2)

        return metadata

    def _infer_format(self):
        """Infer the format name from the configured file extension."""
        ext_to_format = {
            ".cnf": "dimacs", ".opb": "opb", ".mps": "mps",
            ".lp": "lp", ".fzn": "fzn", ".gms": "gms", ".pip": "pip",
        }
        return ext_to_format.get(self.extension)

    def __repr__(self):
        return f"{self.__class__.__name__}(output_dir='{self.output_dir}', extension='{self.extension}')"
class Lambda:
    """
    Wraps an arbitrary callable with a descriptive name for a nicer repr.

    Arguments:
        fn (callable): The function to wrap.
        name (str, optional): Display name for repr. Defaults to the
            function's ``__name__`` attribute.
    """

    def __init__(self, fn, name=None):
        self.fn = fn
        # Fall back to the callable's own name (or 'lambda') when no name is given
        self.name = name if name else getattr(fn, '__name__', 'lambda')

    def __call__(self, x):
        return self.fn(x)

    def __repr__(self):
        return f"{self.__class__.__name__}(name='{self.name}')"
+ ValueError: If the dataset directory does not contain any instance files. + """ self.dataset_dir = pathlib.Path(dataset_dir) self.transform = transform self.target_transform = target_transform @@ -168,32 +185,21 @@ def __init__( if not self.dataset_dir.exists(): if not download: - raise ValueError(f"Dataset not found. Please set download=True to download the dataset.") + raise ValueError("Dataset not found. Please set download=True to download the dataset.") else: self.download() self._collect_all_metadata() files = self._list_instances() print(f"Finished downloading {len(files)} instances") - # Generate sidecar metadata for existing datasets that lack them - self._collect_all_metadata() - files = self._list_instances() if len(files) == 0: raise ValueError(f"Cannot find any instances inside dataset {self.dataset_dir}. Is it a valid dataset? If so, please report on GitHub.") - @classmethod - def dataset_metadata(cls) -> dict: - """Return dataset-level metadata as a dictionary.""" - return { - "name": cls.name, - "description": cls.description, - "url": cls.url, - "license": cls.license, - "citation": cls.citation, - "domain": cls.domain, - "format": cls.format, - } + + # ---------------------------------------------------------------------------- # + # Methods to implement in subclasses: # + # ---------------------------------------------------------------------------- # @abstractmethod def category(self) -> dict: @@ -208,15 +214,103 @@ def category(self) -> dict: @abstractmethod def download(self, *args, **kwargs): """ - How the dataset should be downloaded. + Download the dataset. """ pass + + # ---------------------------------------------------------------------------- # + # Methods to optionally overwrite # + # ---------------------------------------------------------------------------- # + + def collect_instance_metadata(self, file: pathlib.Path) -> dict: + """ + Provide domain-specific instance metadata. + Called once after download for each instance. 
+ + Arguments: + file: path to the instance file + + Returns: + dict with instance-specific metadata fields + """ + return {} + + def collect_instance_features(self, file: pathlib.Path) -> dict: + """ + Collect domain-specific instance features + that augment the generic CP features extracted from the model. + + Arguments: + file: path to the instance file + + Returns: + dict with domain-specific feature fields + """ + return {} + + def open(self, instance) -> io.TextIOBase: + """ + How an instance file from the dataset should be opened. + Especially usefull when files come compressed and won't work with + python standard library's 'open', e.g. '.xz', '.lzma'. + """ + return open(instance, "r") + + + # ---------------------------------------------------------------------------- # + # Public interface # + # ---------------------------------------------------------------------------- # + + def metadata(self, file: pathlib.Path) -> dict: + metadata = self.category() | { + 'dataset': self.name, + 'name': pathlib.Path(file).stem.replace(self.extension, ''), + 'path': file, + } + # Load sidecar metadata if it exists + meta_path = self._metadata_path(file) + if meta_path.exists(): + with open(meta_path, "r") as f: + sidecar = json.load(f) + # Handle structured vs flat sidecar format + if isinstance(sidecar.get("dataset"), dict): + # Structured: flatten instance_metadata and format_metadata + metadata.update(sidecar.get("instance_metadata", {})) + metadata.update(sidecar.get("format_metadata", {})) + metadata.update(sidecar.get("model_features", {})) + else: + # Legacy flat format + metadata.update(sidecar) + return metadata + + @classmethod + def dataset_metadata(cls) -> dict: + """ + Return dataset-level metadata as a dictionary. 
+ """ + return { + "name": cls.name, + "description": cls.description, + "url": cls.url, + "license": cls.license, + "citation": cls.citation, + "domain": cls.domain, + "format": cls.format, + } + + + # ---------------------------------------------------------------------------- # + # Internals # + # ---------------------------------------------------------------------------- # + + # ------------------------------ Instance access ----------------------------- # + def _list_instances(self) -> list: """ List all instance files, excluding metadata sidecar files. - Returns a sorted list of pathlib.Path objects for all instance files + Returns a sorted list of `pathlib.Path` objects for all instance files matching the dataset's extension pattern. """ return sorted([ @@ -224,34 +318,57 @@ def _list_instances(self) -> list: if f.is_file() and not str(f).endswith(self.METADATA_EXTENSION) ]) - def _metadata_path(self, instance_path) -> pathlib.Path: - """Return the path to the .meta.json sidecar file for a given instance.""" - return pathlib.Path(str(instance_path) + self.METADATA_EXTENSION) + def __len__(self) -> int: + """Return the total number of instances.""" + return len(self._list_instances()) - def collect_instance_metadata(self, file) -> dict: + def __getitem__(self, index: int) -> Tuple[Any, Any]: + if index < 0 or index >= len(self): + raise IndexError("Index out of range") + + files = self._list_instances() + file_path = files[index] + filename = str(file_path) + + metadata = self.metadata(file=filename) + if self.target_transform: + metadata = self.target_transform(metadata) + + if self.transform: + filename = self.transform(filename) + # Let transforms contribute to metadata (e.g. 
model verification info) + if hasattr(self.transform, 'enrich_metadata'): + metadata = self.transform.enrich_metadata(filename, metadata) + + return filename, metadata + + + # ---------------------------- Metadata collection --------------------------- # + + def _metadata_path(self, instance_path: pathlib.Path) -> pathlib.Path: """ - Override in subclass to provide domain-specific instance metadata. - Called once after download for each instance. + Return the path to the `.meta.json` sidecar file for a given instance. Arguments: - file: path to the instance file + instance_path: path to the instance file Returns: - dict with instance-specific metadata fields + path to the `.meta.json` sidecar file """ - return {} + return pathlib.Path(str(instance_path) + self.METADATA_EXTENSION) def _collect_all_metadata(self, force=False): - """Collect and store structured metadata sidecar files for all instances. + """ + Collect and store structured metadata sidecar files for all instances. - Writes a structured ``.meta.json`` sidecar alongside each instance with: + Writes a structured `.meta.json` sidecar alongside each instance with: - - ``dataset``: dataset-level metadata (name, description, url, ...) - - ``instance_name``: logical instance name (filename stem) - - ``source_file``: path to the instance file - - ``category``: dataset category labels (year, track, variant, family) - - ``instance_metadata``: portable domain-specific metadata - - ``format_metadata``: format-specific metadata from the source format + - `dataset`: dataset-level metadata (name, description, url, ...) 
+ - `instance_name`: logical instance name (filename stem) + - `source_file`: path to the instance file + - `category`: dataset category labels (year, track, variant, family) + - `instance_metadata`: portable domain-specific metadata + - `format_metadata`: format-specific metadata from the source format Arguments: force (bool): If True, re-collect instance metadata even if sidecar @@ -341,20 +458,7 @@ def _collect_all_metadata(self, force=False): with open(meta_path, "w") as f: json.dump(sidecar, f, indent=2) - def collect_instance_features(self, file) -> dict: - """ - Override in subclass to provide domain-specific instance features - that augment the generic CP features extracted from the model. - - Arguments: - file: path to the instance file - - Returns: - dict with domain-specific feature fields - """ - return {} - - def collect_features(self): + def _collect_features(self): """ Extract CP model features for all instances using the dataset's reader. @@ -415,7 +519,7 @@ def collect_features(self): try: model = self.reader(str(file_path), open=self.open) - features = extract_model_features(model) + features = _extract_model_features(model) except Exception as e: features = {"_feature_error": str(e)} errors.append((str(file_path), str(e))) @@ -444,59 +548,8 @@ def collect_features(self): f"First error: {errors[0][1]}" ) - def open(self, instance) -> io.TextIOBase: - """ - How an instance file from the dataset should be opened. - Especially usefull when files come compressed and won't work with - python standard library's 'open', e.g. '.xz', '.lzma'. 
- """ - return open(instance, "r") - - def metadata(self, file) -> dict: - metadata = self.category() | { - 'dataset': self.name, - 'name': pathlib.Path(file).stem.replace(self.extension, ''), - 'path': file, - } - # Load sidecar metadata if it exists - meta_path = self._metadata_path(file) - if meta_path.exists(): - with open(meta_path, "r") as f: - sidecar = json.load(f) - # Handle structured vs flat sidecar format - if isinstance(sidecar.get("dataset"), dict): - # Structured: flatten instance_metadata and format_metadata - metadata.update(sidecar.get("instance_metadata", {})) - metadata.update(sidecar.get("format_metadata", {})) - metadata.update(sidecar.get("model_features", {})) - else: - # Legacy flat format - metadata.update(sidecar) - return metadata - - def __len__(self) -> int: - """Return the total number of instances.""" - return len(self._list_instances()) - - def __getitem__(self, index: int) -> Tuple[Any, Any]: - if index < 0 or index >= len(self): - raise IndexError("Index out of range") - - files = self._list_instances() - file_path = files[index] - filename = str(file_path) - - metadata = self.metadata(file=filename) - if self.target_transform: - metadata = self.target_transform(metadata) - - if self.transform: - filename = self.transform(filename) - # Let transforms contribute to metadata (e.g. 
model verification info) - if hasattr(self.transform, 'enrich_metadata'): - metadata = self.transform.enrich_metadata(filename, metadata) - - return filename, metadata + + # ----------------------------- Download methods ----------------------------- # @staticmethod def _try_origin(base_url: str, target: str, destination: str, desc: str, chunk_size: int) -> Optional[pathlib.Path]: From 671eeb4fcf50a8cd781f850ffeb68260a59d1442 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 13 Feb 2026 11:28:07 +0100 Subject: [PATCH 107/152] Remove duplicate methods --- cpmpy/tools/dataset/_base.py | 103 ----------------------------------- 1 file changed, 103 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 956d74a07..de6523573 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -236,19 +236,6 @@ def collect_instance_metadata(self, file: pathlib.Path) -> dict: """ return {} - def collect_instance_features(self, file: pathlib.Path) -> dict: - """ - Collect domain-specific instance features - that augment the generic CP features extracted from the model. - - Arguments: - file: path to the instance file - - Returns: - dict with domain-specific feature fields - """ - return {} - def open(self, instance) -> io.TextIOBase: """ How an instance file from the dataset should be opened. @@ -458,96 +445,6 @@ def _collect_all_metadata(self, force=False): with open(meta_path, "w") as f: json.dump(sidecar, f, indent=2) - def _collect_features(self): - """ - Extract CP model features for all instances using the dataset's reader. - - Parses each instance into a CPMpy model, extracts generic model features - via extract_model_features(), and optionally collects domain-specific - features via collect_instance_features(). - - Results are stored in the ``model_features`` section of ``.meta.json`` - sidecar files (structured format) or as flat fields (legacy format). 
- """ - if self.reader is None: - raise ValueError( - f"No reader configured for {self.__class__.__name__}. " - f"Set the 'reader' class attribute to enable feature extraction." - ) - - files = self._list_instances() - - # Filter files that need processing - files_to_process = [] - for file_path in files: - meta_path = self._metadata_path(file_path) - existing = {} - if meta_path.exists(): - with open(meta_path, "r") as f: - existing = json.load(f) - # Skip if features already collected - if isinstance(existing.get("dataset"), dict): - # Structured format — check model_features section - if "model_features" in existing: - continue - else: - # Legacy flat format - if "num_variables" in existing: - continue - files_to_process.append(file_path) - - if not files_to_process: - return - - errors = [] - - # Use tqdm for progress if available - if tqdm is not None: - file_iter = tqdm(files_to_process, desc="Collecting features", unit="instance") - else: - file_iter = files_to_process - print(f"Collecting features for {len(files_to_process)} instances...") - - for file_path in file_iter: - meta_path = self._metadata_path(file_path) - - # Load existing sidecar (or empty dict) - existing = {} - if meta_path.exists(): - with open(meta_path, "r") as f: - existing = json.load(f) - - try: - model = self.reader(str(file_path), open=self.open) - features = _extract_model_features(model) - except Exception as e: - features = {"_feature_error": str(e)} - errors.append((str(file_path), str(e))) - - # Collect domain-specific features - try: - domain_features = self.collect_instance_features(str(file_path)) - features.update(domain_features) - except Exception as e: - features["_domain_feature_error"] = str(e) - - # Store features in the appropriate location - if isinstance(existing.get("dataset"), dict): - # Structured format: store in model_features section - existing["model_features"] = features - else: - # Legacy flat format - existing.update(features) - - with open(meta_path, "w") 
as f: - json.dump(existing, f, indent=2) - - if errors: - warnings.warn( - f"Feature extraction failed for {len(errors)}/{len(files_to_process)} instances. " - f"First error: {errors[0][1]}" - ) - # ----------------------------- Download methods ----------------------------- # From ca67973f412e6209140281477dc89d7382225e7c Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 13 Feb 2026 11:40:21 +0100 Subject: [PATCH 108/152] simplify metadata logic --- cpmpy/tools/dataset/_base.py | 80 +++++++++++++++++------------------- 1 file changed, 38 insertions(+), 42 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index de6523573..a35d60229 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -18,15 +18,7 @@ from urllib.request import HTTPError, Request, urlopen from concurrent.futures import ThreadPoolExecutor, as_completed -def format_bytes(bytes_num): - """ - Format bytes into human-readable string (e.g., KB, MB, GB). - """ - for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']: - if bytes_num < 1024.0: - return f"{bytes_num:.1f} {unit}" - bytes_num /= 1024.0 - +# tqdm as an optional dependency, provides prettier progress bars try: from tqdm import tqdm except ImportError: @@ -44,6 +36,16 @@ def format_bytes(bytes_num): _FORMAT_SPECIFIC_PREFIXES = ("opb_", "wcnf_", "mps_", "xcsp_", "dimacs_") +def _format_bytes(bytes_num): + """ + Format bytes into human-readable string (e.g., KB, MB, GB). + """ + for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']: + if bytes_num < 1024.0: + return f"{bytes_num:.1f} {unit}" + bytes_num /= 1024.0 + + def portable_instance_metadata(metadata: dict) -> dict: """Filter sidecar metadata to only portable, domain-specific fields. 
@@ -129,6 +131,11 @@ def _count_constraints(c): } +def extract_model_features(model) -> dict: + """Public wrapper for extracting generic CPMpy model features.""" + return _extract_model_features(model) + + class _Dataset(ABC): """ Abstract base class for PyTorch-style datasets of benchmarking instances. @@ -260,15 +267,10 @@ def metadata(self, file: pathlib.Path) -> dict: if meta_path.exists(): with open(meta_path, "r") as f: sidecar = json.load(f) - # Handle structured vs flat sidecar format - if isinstance(sidecar.get("dataset"), dict): - # Structured: flatten instance_metadata and format_metadata - metadata.update(sidecar.get("instance_metadata", {})) - metadata.update(sidecar.get("format_metadata", {})) - metadata.update(sidecar.get("model_features", {})) - else: - # Legacy flat format - metadata.update(sidecar) + # Structured: flatten instance_metadata, format_metadata, and model_features + metadata.update(sidecar.get("instance_metadata", {})) + metadata.update(sidecar.get("format_metadata", {})) + metadata.update(sidecar.get("model_features", {})) return metadata @classmethod @@ -369,15 +371,6 @@ def _collect_all_metadata(self, force=False): meta_path = self._metadata_path(file_path) if force or not meta_path.exists(): files_to_process.append(file_path) - else: - # Upgrade old flat sidecars to structured format - try: - with open(meta_path, "r") as f: - existing = json.load(f) - if not isinstance(existing.get("dataset"), dict): - files_to_process.append(file_path) - except (json.JSONDecodeError, IOError): - files_to_process.append(file_path) if not files_to_process: return @@ -423,25 +416,29 @@ def _collect_all_metadata(self, force=False): if "_metadata_error" in instance_meta: sidecar["_metadata_error"] = instance_meta["_metadata_error"] - # Preserve model features from existing sidecar if present + # Preserve previously extracted model features if present. + # Otherwise, compute them from the parsed model when possible. 
+ model_features = None if meta_path.exists(): try: with open(meta_path, "r") as f: existing = json.load(f) if "model_features" in existing: - sidecar["model_features"] = existing["model_features"] - else: - # Upgrade: extract flat model features from old-style sidecar - model_feats = { - k: v for k, v in existing.items() - if k in _MODEL_FEATURE_FIELDS - or k in ("_feature_error", "_domain_feature_error") - } - if model_feats: - sidecar["model_features"] = model_feats + model_features = existing["model_features"] except (json.JSONDecodeError, IOError): pass + if model_features is None: + if not callable(self.reader): + raise TypeError( + f"Cannot extract model features for {file_path}: " + "no dataset reader configured. If unexpected, please open an issue on GitHub." + ) + model = self.reader(str(file_path), open=self.open) + model_features = extract_model_features(model) + + sidecar["model_features"] = model_features + with open(meta_path, "w") as f: json.dump(sidecar, f, indent=2) @@ -471,7 +468,7 @@ def _try_origin(base_url: str, target: str, destination: str, desc: str, chunk_s _Dataset._download_sequential(full_url, destination, total_size, desc, chunk_size) return pathlib.Path(destination) - except (HTTPError, URLError) as e: + except (HTTPError, URLError): return None @staticmethod @@ -617,7 +614,6 @@ def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc chunk_size: int = 1024 * 1024): """Download file sequentially with progress bar.""" import sys - import os # Convert to Path if it's a string if isinstance(filepath, str): @@ -664,9 +660,9 @@ def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc downloaded += len(chunk) if total_size > 0: percent = (downloaded / total_size) * 100 - sys.stdout.write(f"\r\033[KDownloading {desc}: {format_bytes(downloaded)}/{format_bytes(total_size)} ({percent:.1f}%)") + sys.stdout.write(f"\r\033[KDownloading {desc}: {_format_bytes(downloaded)}/{_format_bytes(total_size)} 
({percent:.1f}%)") else: - sys.stdout.write(f"\r\033[KDownloading {desc}: {format_bytes(downloaded)}...") + sys.stdout.write(f"\r\033[KDownloading {desc}: {_format_bytes(downloaded)}...") sys.stdout.flush() sys.stdout.write("\n") sys.stdout.flush() From 21ea64bfd2f80a7b11d111368cc114331338e955 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 13 Feb 2026 11:40:44 +0100 Subject: [PATCH 109/152] More IO optional dependencies --- setup.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bb1b96f37..10d2815ab 100644 --- a/setup.py +++ b/setup.py @@ -38,8 +38,16 @@ def get_version(rel_path): format_dependencies = { "io.mps": ["pyscipopt"], + "io.lp": ["pyscipopt"], + "io.cip": ["pyscipopt"], + "io.fzn": ["pyscipopt"], + "io.gms": ["pyscipopt"], + "io.pip": ["pyscipopt"], "io.scip": ["pyscipopt"], - "io.dimacs": ["pyscipopt"], + "io.dimacs": solver_dependencies["pindakaas"], # Required for write_dimacs (uses to_cnf transformation) + "io.opb": [], # No external dependencies + "io.wcnf": [], # No external dependencies + "io.xcsp3": ["pycsp3"], } setup( From 782ad0d4763dbf6cb0cc39bf18b1a012f3447c3e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 13 Feb 2026 11:43:45 +0100 Subject: [PATCH 110/152] clarify name --- cpmpy/tools/dataset/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index a35d60229..f6d0a9263 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -256,7 +256,7 @@ def open(self, instance) -> io.TextIOBase: # Public interface # # ---------------------------------------------------------------------------- # - def metadata(self, file: pathlib.Path) -> dict: + def instance_metadata(self, file: pathlib.Path) -> dict: metadata = self.category() | { 'dataset': self.name, 'name': pathlib.Path(file).stem.replace(self.extension, ''), @@ -319,7 +319,7 @@ def __getitem__(self, index: int) -> Tuple[Any, 
Any]: file_path = files[index] filename = str(file_path) - metadata = self.metadata(file=filename) + metadata = self.instance_metadata(file=filename) if self.target_transform: metadata = self.target_transform(metadata) From 7bd68da85cbf4edf6af734e93e84e76dbecf6f5b Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 13 Feb 2026 11:49:16 +0100 Subject: [PATCH 111/152] support multiple citations --- cpmpy/tools/dataset/_base.py | 9 +++++++-- cpmpy/tools/dataset/nurserostering.py | 10 +++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index f6d0a9263..5d04861ad 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -153,7 +153,7 @@ class _Dataset(ABC): description = "" url = "" license = "" - citation = "" + citation: List[str] = [] domain = "" format = "" reader = None # callable(file_path, open=open) -> cp.Model @@ -278,12 +278,17 @@ def dataset_metadata(cls) -> dict: """ Return dataset-level metadata as a dictionary. """ + if isinstance(cls.citation, str): + citations = [cls.citation] if cls.citation else [] + else: + citations = list(cls.citation) + return { "name": cls.name, "description": cls.description, "url": cls.url, "license": cls.license, - "citation": cls.citation, + "citation": citations, "domain": cls.domain, "format": cls.format, } diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py index 5a989f342..a6d454c4f 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -42,11 +42,15 @@ class NurseRosteringDataset(_Dataset): # torch.utils.data.Dataset compatible name = "nurserostering" description = "Nurse rostering benchmark instances from schedulingbenchmarks.org." url = "https://schedulingbenchmarks.org/nrp/" - license = "" - citation = "" + citation = [ + "Strandmark, P., Qu, Y. and Curtois, T. 
First-order linear programming in a column generation-based heuristic approach to the nurse rostering problem. Computers & Operations Research, 2020. 120, p. 104945.", + "Demirovic, E., Musliu, N., and Winter, F. Modeling and solving staff scheduling with partial weighted maxSAT. Annals of Operations Research, 2019. 275(1): p. 79-99.", + "Smet P. Constraint reformulation for nurse rostering problems, in: PATAT 2018 twelfth international conference on the practice and theory of automated timetabling, Vienna, August, 2018, p. 69-80.", + "Rahimian, E., Akartunali, K., and Levine, J. A hybrid integer programming and variable neighbourhood search algorithm to solve nurse rostering problems. European Journal of Operational Research, 2017. 258(2): p. 411-423.", + ] domain = "scheduling" format = "NRP text" - origins = [] # Will be populated from config if available + @staticmethod def _reader(file_path, open=open): From f36e21334938e444e47c28c7f9a7bf5c9dab8e5a Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 16 Feb 2026 12:33:33 +0100 Subject: [PATCH 112/152] Consistent dataset objects --- cpmpy/tools/dataset/_base.py | 3 + cpmpy/tools/dataset/jsplib.py | 69 ++++++++--------- cpmpy/tools/dataset/miplib.py | 105 ++++++++++++-------------- cpmpy/tools/dataset/mse.py | 24 ++---- cpmpy/tools/dataset/nurserostering.py | 54 ++++++------- cpmpy/tools/dataset/opb.py | 23 ++---- cpmpy/tools/dataset/psplib.py | 24 +++--- cpmpy/tools/dataset/xcsp3.py | 73 +++++++++--------- 8 files changed, 170 insertions(+), 205 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 5d04861ad..b53be45fe 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -189,6 +189,9 @@ def __init__( self.transform = transform self.target_transform = target_transform self.extension = extension + if not self.origins: + from cpmpy.tools.dataset.config import get_origins + self.origins = get_origins(self.name) if not self.dataset_dir.exists(): if not 
download: diff --git a/cpmpy/tools/dataset/jsplib.py b/cpmpy/tools/dataset/jsplib.py index 8e9eb8f8c..8cbc2289b 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/dataset/jsplib.py @@ -7,17 +7,14 @@ https://github.com/tamy0612/JSPLIB """ -import io import os import json import pathlib -from typing import Tuple, Any import zipfile import numpy as np import cpmpy as cp from cpmpy.tools.dataset._base import _Dataset -from cpmpy.tools.dataset.config import get_origins class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible @@ -31,18 +28,18 @@ class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible name = "jsplib" description = "Job Shop Scheduling Problem benchmark library." url = "https://github.com/tamy0612/JSPLIB" - license = "" - citation = "" + citation = [ + "J. Adams, E. Balas, D. Zawack. 'The shifting bottleneck procedure for job shop scheduling.', Management Science, Vol. 34, Issue 3, pp. 391-401, 1988.", + "J.F. Muth, G.L. Thompson. 'Industrial scheduling.', Englewood Cliffs, NJ, Prentice-Hall, 1963.", + "S. Lawrence. 'Resource constrained project scheduling: an experimental investigation of heuristic scheduling techniques (Supplement).', Graduate School of Industrial Administration. Pittsburgh, Pennsylvania, Carnegie-Mellon University, 1984.", + "D. Applegate, W. Cook. 'A computational study of job-shop scheduling.', ORSA Journal on Computer, Vol. 3, Isuue 2, pp. 149-156, 1991.", + "R.H. Storer, S.D. Wu, R. Vaccari. 'New search spaces for sequencing problems with applications to job-shop scheduling.', Management Science Vol. 38, Issue 10, pp. 1495-1509, 1992.", + "T. Yamada, R. Nakano. 'A genetic algorithm applicable to large-scale job-shop problems.', Proceedings of the Second international workshop on parallel problem solving from Nature (PPSN'2). Brussels (Belgium), pp. 281-290, 1992.", + "E. Taillard. 'Benchmarks for basic scheduling problems', European Journal of Operational Research, Vol. 64, Issue 2, pp. 
278-285, 1993.", + ] domain = "scheduling" format = "JSPLib" - origins = [] # Will be populated from config if available - @staticmethod - def _reader(file_path, open=open): - from cpmpy.tools.io.jsplib import read_jsplib - return read_jsplib(file_path, open=open) - - reader = _reader def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False): """ @@ -61,30 +58,25 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl dataset_dir = self.root / self.name - # Load origins from config - if not self.origins: - self.origins = get_origins(self.name) - super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension="" ) + + @staticmethod + def reader(file_path, open=open): + from cpmpy.tools.io.jsplib import read_jsplib + return read_jsplib(file_path, open=open) + def category(self) -> dict: return {} # no categories - def _list_instances(self): - """List JSPLib instances, excluding metadata and JSON files.""" - return sorted([ - f for f in self.dataset_dir.rglob("*") - if f.is_file() - and not str(f).endswith(self.METADATA_EXTENSION) - and not str(f).endswith(".json") - ]) - def collect_instance_metadata(self, file) -> dict: - """Extract metadata from instances.json and instance file header.""" + """ + Extract metadata from instances.json and instance file header. 
+ """ # Lazy load the source metadata if self._source_metadata is None: source_path = self.dataset_dir / self._source_metadata_file @@ -129,24 +121,13 @@ def collect_instance_metadata(self, file) -> dict: break return result - def __getitem__(self, index): - """Supports both integer index and string name lookup.""" - if isinstance(index, str): - files = self._list_instances() - for file_path in files: - if file_path.stem == index: - idx = files.index(file_path) - return super().__getitem__(idx) - raise IndexError(f"Instance '{index}' not found in dataset") - return super().__getitem__(index) - def download(self): url = "https://github.com/tamy0612/JSPLIB/archive/refs/heads/" # download full repo... target = "master.zip" target_download_path = self.root / target - print(f"Downloading JSPLib instances from github.com/tamy0612/JSPLIB") + print("Downloading JSPLib instances from github.com/tamy0612/JSPLIB") try: target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) @@ -173,6 +154,18 @@ def download(self): def open(self, instance: os.PathLike) -> callable: return open(instance, "r") + def _list_instances(self): + """ + List JSPLib instances, excluding metadata and JSON files. + + Special overwrite due to JSPLib not using file extensions for its instances. 
+ """ + return sorted([ + f for f in self.dataset_dir.rglob("*") + if f.is_file() + and not str(f).endswith(self.METADATA_EXTENSION) + and not str(f).endswith(".json") + ]) def parse_jsp(filename: str): """ diff --git a/cpmpy/tools/dataset/miplib.py b/cpmpy/tools/dataset/miplib.py index 67e7cc8f3..c6c77c467 100644 --- a/cpmpy/tools/dataset/miplib.py +++ b/cpmpy/tools/dataset/miplib.py @@ -12,7 +12,6 @@ import io from cpmpy.tools.dataset._base import _Dataset -from cpmpy.tools.dataset.config import get_origins class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible @@ -26,58 +25,13 @@ class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible name = "miplib" description = "Mixed Integer Programming Library benchmark instances." url = "https://miplib.zib.de/" - license = "" - citation = "" + citation = [ + "Gleixner, A., Hendel, G., Gamrath, G., Achterberg, T., Bastubbe, M., Berthold, T., Christophel, P. M., Jarck, K., Koch, T., Linderoth, J., Lubbecke, M., Mittelmann, H. D., Ozyurt, D., Ralphs, T. K., Salvagnin, D., and Shinano, Y. MIPLIB 2017: Data-Driven Compilation of the 6th Mixed-Integer Programming Library. Mathematical Programming Computation, 2021. 
https://doi.org/10.1007/s12532-020-00194-3.", + ] domain = "mixed integer programming" format = "MPS" - origins = [] # Will be populated from config if available - @staticmethod - def _reader(file_path, open=open): - from cpmpy.tools.io.scip import read_scip - return read_scip(file_path, open=open) - - reader = _reader - - def collect_instance_metadata(self, file) -> dict: - """Extract row/column counts from MPS file sections.""" - result = {} - try: - with self.open(file) as f: - section = None - num_rows = 0 - columns = set() - has_objective = False - for line in f: - stripped = line.strip() - if stripped.startswith("NAME"): - section = "NAME" - elif stripped == "ROWS": - section = "ROWS" - elif stripped == "COLUMNS": - section = "COLUMNS" - elif stripped in ("RHS", "RANGES", "BOUNDS", "ENDATA"): - section = stripped - elif section == "ROWS" and stripped: - parts = stripped.split() - if parts[0] == "N": - has_objective = True - else: - num_rows += 1 - elif section == "COLUMNS" and stripped: - parts = stripped.split() - if parts: - columns.add(parts[0]) - elif section in ("RHS", "RANGES", "BOUNDS", "ENDATA"): - pass # skip to avoid parsing entire file - if section == "ENDATA": - break - result["mps_num_rows"] = num_rows - result["mps_num_columns"] = len(columns) - result["mps_has_objective"] = has_objective - except Exception: - pass - return result + def __init__( self, @@ -108,16 +62,17 @@ def __init__( dataset_dir = self.root / self.name / str(year) / track - # Load origins from config - if not self.origins: - self.origins = get_origins(self.name) - super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".mps.gz" ) + @staticmethod + def reader(file_path, open=open): + from cpmpy.tools.io.scip import read_scip + return read_scip(file_path, open=open) + def category(self) -> dict: return { "year": self.year, @@ -130,7 +85,7 @@ def download(self): target = "collection.zip" 
target_download_path = self.root / target - print(f"Downloading MIPLib instances from miplib.zib.de") + print("Downloading MIPLib instances from miplib.zib.de") try: target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) @@ -150,6 +105,46 @@ def download(self): # Clean up the zip file target_download_path.unlink() + def collect_instance_metadata(self, file) -> dict: + """Extract row/column counts from MPS file sections.""" + result = {} + try: + with self.open(file) as f: + section = None + num_rows = 0 + columns = set() + has_objective = False + for line in f: + stripped = line.strip() + if stripped.startswith("NAME"): + section = "NAME" + elif stripped == "ROWS": + section = "ROWS" + elif stripped == "COLUMNS": + section = "COLUMNS" + elif stripped in ("RHS", "RANGES", "BOUNDS", "ENDATA"): + section = stripped + elif section == "ROWS" and stripped: + parts = stripped.split() + if parts[0] == "N": + has_objective = True + else: + num_rows += 1 + elif section == "COLUMNS" and stripped: + parts = stripped.split() + if parts: + columns.add(parts[0]) + elif section in ("RHS", "RANGES", "BOUNDS", "ENDATA"): + pass # skip to avoid parsing entire file + if section == "ENDATA": + break + result["mps_num_rows"] = num_rows + result["mps_num_columns"] = len(columns) + result["mps_has_objective"] = has_objective + except Exception: + pass + return result + def open(self, instance: os.PathLike) -> io.TextIOBase: return gzip.open(instance, "rt") if str(instance).endswith(".gz") else open(instance) diff --git a/cpmpy/tools/dataset/mse.py b/cpmpy/tools/dataset/mse.py index 5c2aa8ad6..7403dea84 100644 --- a/cpmpy/tools/dataset/mse.py +++ b/cpmpy/tools/dataset/mse.py @@ -12,7 +12,6 @@ import io from cpmpy.tools.dataset._base import _Dataset -from cpmpy.tools.dataset.config import get_origins class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible @@ -32,18 +31,8 @@ class MSEDataset(_Dataset): # 
torch.utils.data.Dataset compatible name = "mse" description = "MaxSAT Evaluation competition benchmark instances." url = "https://maxsat-evaluations.github.io/" - license = "" - citation = "" domain = "maximum satisfiability" - format = "WCNF" - origins = [] # Will be populated from config if available - @staticmethod - def _reader(file_path, open=open): - from cpmpy.tools.io.wcnf import read_wcnf - return read_wcnf(file_path, open=open) - - reader = _reader def __init__( self, @@ -80,10 +69,6 @@ def __init__( raise ValueError("Track must be specified, e.g. OPT-LIN, DEC-LIN, ...") dataset_dir = self.root / self.name / str(year) / track - - # Load origins from config - if not self.origins: - self.origins = get_origins(self.name) super().__init__( dataset_dir=dataset_dir, @@ -91,6 +76,12 @@ def __init__( download=download, extension=".wcnf.xz" ) + + @staticmethod + def reader(file_path, open=open): + from cpmpy.tools.io.wcnf import read_wcnf + return read_wcnf(file_path, open=open) + def category(self) -> dict: return { "year": self.year, @@ -98,7 +89,8 @@ def category(self) -> dict: } def collect_instance_metadata(self, file) -> dict: - """Extract statistics from WCNF header comments. + """ + Extract statistics from WCNF header comments. WCNF files from MSE contain JSON-like statistics in comment lines: nvars, ncls, nhards, nsofts, total_lits, nsoft_wts, and length stats. 
diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py index a6d454c4f..21d3143c6 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -15,7 +15,6 @@ import cpmpy as cp from cpmpy.tools.dataset._base import _Dataset -from cpmpy.tools.dataset.config import get_origins # Optional dependencies try: @@ -52,14 +51,7 @@ class NurseRosteringDataset(_Dataset): # torch.utils.data.Dataset compatible format = "NRP text" - @staticmethod - def _reader(file_path, open=open): - from cpmpy.tools.io.nurserostering import read_nurserostering - return read_nurserostering(file_path, open=open) - - reader = _reader - - def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False, sort_key=None): + def __init__(self, root: str = ".", transform=None, target_transform=None, download:bool=False, sort_key=None): """ Initialize the Nurserostering Dataset. @@ -78,21 +70,24 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl dataset_dir = self.root / self.name - # Load origins from config - if not self.origins: - self.origins = get_origins(self.name) - super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".txt" ) + @staticmethod + def reader(file_path, open=open): + from cpmpy.tools.io.nurserostering import read_nurserostering + return read_nurserostering(file_path, open=open) + def category(self) -> dict: return {} # no categories def collect_instance_metadata(self, file) -> dict: - """Extract scheduling metadata from nurse rostering instance.""" + """ + Extract scheduling metadata from nurse rostering instance. + """ try: data = parse_scheduling_period(file) return { @@ -114,7 +109,7 @@ def download(self): target = "instances1_24.zip" # download full repo... 
target_download_path = self.root / target - print(f"Downloading Nurserostering instances from schedulingbenchmarks.org") + print("Downloading Nurserostering instances from schedulingbenchmarks.org") try: target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) @@ -216,10 +211,15 @@ def parse_scheduling_period(filename: str): """ Parse a nurserostering instance file. - Args: - filename: Path to the nurserostering instance file. + Arguments: + filename (str): Path to the nurserostering instance file. - Returns a dictionary with native Python data structures (lists of dicts). + Returns: + dict: A dictionary with native Python data structures (lists of dicts). + + Raises: + ValueError: If the file is not found. + Use to_dataframes() transform to convert to pandas DataFrames if needed. Use add_fake_names() transform to add randomly generated names to staff. """ @@ -295,7 +295,7 @@ def parse_scheduling_period(filename: str): shift_on=shift_on, shift_off=shift_off, cover=cover) -def _add_fake_names(data, seed=0): +def add_fake_names(data, seed=0): """ Transform function to add randomly generated names to staff using Faker. @@ -316,12 +316,12 @@ def _add_fake_names(data, seed=0): ) ) - Args: - data: Dictionary returned by parse_scheduling_period() - seed: Random seed for reproducible name generation (default: 0) + Arguments: + data (dict): Dictionary returned by parse_scheduling_period() + seed (int): Random seed for reproducible name generation (default: 0) Returns: - Dictionary with 'name' field added to each staff member + dict: Dictionary with 'name' field added to each staff member Raises: ImportError: If Faker is not installed @@ -339,7 +339,7 @@ def _add_fake_names(data, seed=0): return data -def _to_dataframes(data): +def to_dataframes(data): """ Transform function to convert native data structures to pandas DataFrames. 
@@ -352,11 +352,11 @@ def _to_dataframes(data): transform=lambda fname: to_dataframes(parse_scheduling_period(fname)) ) - Args: - data: Dictionary returned by parse_scheduling_period() + Arguments: + data (dict): Dictionary returned by parse_scheduling_period() Returns: - Dictionary with pandas DataFrames instead of native structures + dict: Dictionary with pandas DataFrames instead of native structures Raises: ImportError: If pandas is not installed diff --git a/cpmpy/tools/dataset/opb.py b/cpmpy/tools/dataset/opb.py index 429161c7f..e4e05df3c 100644 --- a/cpmpy/tools/dataset/opb.py +++ b/cpmpy/tools/dataset/opb.py @@ -12,7 +12,6 @@ import io from cpmpy.tools.dataset._base import _Dataset -from cpmpy.tools.dataset.config import get_origins class OPBDataset(_Dataset): @@ -31,18 +30,8 @@ class OPBDataset(_Dataset): name = "opb" description = "Pseudo-Boolean Competition benchmark instances." url = "https://www.cril.univ-artois.fr/PB25/" - license = "" - citation = "" domain = "pseudo-boolean optimization" format = "OPB" - origins = [] # Will be populated from config if available - - @staticmethod - def _reader(file_path, open=open): - from cpmpy.tools.io.opb import read_opb - return read_opb(file_path, open=open) - - reader = _reader def __init__( self, @@ -82,10 +71,6 @@ def __init__( raise ValueError("Track must be specified, e.g. 
exact-weighted, exact-unweighted, ...") dataset_dir = self.root / self.name / str(year) / track / ('selected' if self.competition else 'normalized') - - # Load origins from config - if not self.origins: - self.origins = get_origins(self.name) super().__init__( dataset_dir=dataset_dir, @@ -93,6 +78,12 @@ def __init__( download=download, extension=".opb.xz" ) + + @staticmethod + def reader(file_path, open=open): + from cpmpy.tools.io.opb import read_opb + return read_opb(file_path, open=open) + def category(self) -> dict: return { "year": self.year, @@ -155,7 +146,7 @@ def download(self): break if main_folder is None: - raise ValueError(f"Could not find main folder in tar file") + raise ValueError("Could not find main folder in tar file") # Extract only files from the specified track # Get all unique track names from tar diff --git a/cpmpy/tools/dataset/psplib.py b/cpmpy/tools/dataset/psplib.py index aaa019d76..61f9e5bf5 100644 --- a/cpmpy/tools/dataset/psplib.py +++ b/cpmpy/tools/dataset/psplib.py @@ -10,7 +10,6 @@ import zipfile from cpmpy.tools.dataset._base import _Dataset -from cpmpy.tools.dataset.config import get_origins class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible """ @@ -22,18 +21,9 @@ class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible name = "psplib" description = "Project Scheduling Problem Library (RCPSP) benchmark instances." 
     url = "https://www.om-db.wi.tum.de/psplib/main.html"
-    license = ""
-    citation = ""
     domain = "scheduling"
     format = "PSPLIB SM"
-    origins = []  # Will be populated from config if available
 
-    @staticmethod
-    def _reader(file_path, open=open):
-        from cpmpy.tools.io.rcpsp import read_rcpsp
-        return read_rcpsp(file_path, open=open)
-
-    reader = _reader
 
     def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False):
         """
@@ -69,15 +59,16 @@ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30",
 
         dataset_dir = self.root / self.name / self.variant / self.family
 
-        # Load origins from config
-        if not self.origins:
-            self.origins = get_origins(self.name)
-
         super().__init__(
             dataset_dir=dataset_dir,
             transform=transform, target_transform=target_transform,
             download=download, extension=f".{self.family_codes[self.variant]}"
         )
+    
+    @staticmethod
+    def reader(file_path, open=open):
+        from cpmpy.tools.io.rcpsp import read_rcpsp
+        return read_rcpsp(file_path, open=open)
 
     def category(self) -> dict:
         return {
@@ -116,7 +107,7 @@ def collect_instance_metadata(self, file) -> dict:
             elif line.startswith("- doubly constrained"):
                 match = re.search(r':\s*(\d+)', line)
                 if match:
                     result["num_doubly_constrained_resources"] = int(match.group(1))
             elif line.startswith("PROJECT INFORMATION"):
                 in_project_info = True
             elif in_project_info and not line.startswith("*") and not line.startswith("pronr"):
diff --git a/cpmpy/tools/dataset/xcsp3.py b/cpmpy/tools/dataset/xcsp3.py
index e39dbf726..4bb200bd8 100644
--- a/cpmpy/tools/dataset/xcsp3.py
+++ b/cpmpy/tools/dataset/xcsp3.py
class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible @@ -31,43 +30,14 @@ class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible name = "xcsp3" description = "XCSP3 competition benchmark instances for constraint satisfaction and optimization." url = "https://xcsp.org/instances/" - license = "" - citation = "" domain = "constraint programming" format = "XCSP3" - origins = [] # Will be populated from config if available - @staticmethod - def _reader(file_path, open=open): - from cpmpy.tools.xcsp3.parser import read_xcsp3 - return read_xcsp3(file_path, open=open) + - reader = _reader - def collect_instance_metadata(self, file) -> dict: - """Extract instance type (CSP/COP) from XCSP3 XML root element.""" - import re - result = {} - try: - with self.open(file) as f: - # Read only the first few lines to find the root element - header = "" - for _ in range(10): - line = f.readline() - if not line: - break - header += line - if ">" in line: - break - match = re.search(r'type\s*=\s*"([^"]+)"', header) - if match: - result["instance_type"] = match.group(1) - match = re.search(r'format\s*=\s*"([^"]+)"', header) - if match: - result["xcsp_format"] = match.group(1) - except Exception: - pass - return result + + def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False): """ @@ -84,10 +54,6 @@ def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transf raise ValueError("Year must start with '20'") if not track: raise ValueError("Track must be specified, e.g. 
COP, CSP, MiniCOP, ...") - - # Load origins from config - if not self.origins: - self.origins = get_origins(self.name) super().__init__( dataset_dir=dataset_dir, @@ -95,12 +61,43 @@ def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transf download=download, extension=".xml.lzma" ) + + @classmethod + def reader(file_path, open=open): + from cpmpy.tools.xcsp3.parser import read_xcsp3 + return read_xcsp3(file_path, open=open) + def category(self) -> dict: return { "year": self.year, "track": self.track } + def collect_instance_metadata(self, file) -> dict: + """Extract instance type (CSP/COP) from XCSP3 XML root element.""" + import re + result = {} + try: + with self.open(file) as f: + # Read only the first few lines to find the root element + header = "" + for _ in range(10): + line = f.readline() + if not line: + break + header += line + if ">" in line: + break + match = re.search(r'type\s*=\s*"([^"]+)"', header) + if match: + result["instance_type"] = match.group(1) + match = re.search(r'format\s*=\s*"([^"]+)"', header) + if match: + result["xcsp_format"] = match.group(1) + except Exception: + pass + return result + def download(self): url = "https://www.cril.univ-artois.fr/~lecoutre/compets/" @@ -124,7 +121,7 @@ def download(self): break if main_folder is None: - raise ValueError(f"Could not find main folder in zip file") + raise ValueError("Could not find main folder in zip file") # Extract only files from the specified track # Get all unique track names from zip From a86544f20a80a950b002c74f940737b54b0035b2 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 16 Feb 2026 12:36:28 +0100 Subject: [PATCH 113/152] Remove domain and format tags --- cpmpy/tools/dataset/_base.py | 12 +++++++++--- cpmpy/tools/dataset/jsplib.py | 3 --- cpmpy/tools/dataset/miplib.py | 5 +---- cpmpy/tools/dataset/mse.py | 2 -- cpmpy/tools/dataset/nurserostering.py | 3 --- cpmpy/tools/dataset/opb.py | 2 -- cpmpy/tools/dataset/psplib.py | 3 --- 
cpmpy/tools/dataset/xcsp3.py | 7 ------- 8 files changed, 10 insertions(+), 27 deletions(-) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index b53be45fe..7def2eefe 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/dataset/_base.py @@ -24,6 +24,7 @@ except ImportError: tqdm = None +import cpmpy as cp # Fields produced by extract_model_features() — not portable across format translations _MODEL_FEATURE_FIELDS = frozenset({ @@ -154,9 +155,6 @@ class _Dataset(ABC): url = "" license = "" citation: List[str] = [] - domain = "" - format = "" - reader = None # callable(file_path, open=open) -> cp.Model # Multiple download origins (override in subclasses or via config) # Origins are tried in order, falling back to original url if all fail @@ -211,6 +209,14 @@ def __init__( # Methods to implement in subclasses: # # ---------------------------------------------------------------------------- # + @staticmethod + @abstractmethod + def reader(file_path, open=open) -> cp.Model: + """ + Reader for the dataset. + """ + pass + @abstractmethod def category(self) -> dict: """ diff --git a/cpmpy/tools/dataset/jsplib.py b/cpmpy/tools/dataset/jsplib.py index 8cbc2289b..3ce46daf3 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/dataset/jsplib.py @@ -37,9 +37,6 @@ class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible "T. Yamada, R. Nakano. 'A genetic algorithm applicable to large-scale job-shop problems.', Proceedings of the Second international workshop on parallel problem solving from Nature (PPSN'2). Brussels (Belgium), pp. 281-290, 1992.", "E. Taillard. 'Benchmarks for basic scheduling problems', European Journal of Operational Research, Vol. 64, Issue 2, pp. 
278-285, 1993.", ] - domain = "scheduling" - format = "JSPLib" - def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False): """ diff --git a/cpmpy/tools/dataset/miplib.py b/cpmpy/tools/dataset/miplib.py index c6c77c467..333b6241f 100644 --- a/cpmpy/tools/dataset/miplib.py +++ b/cpmpy/tools/dataset/miplib.py @@ -28,10 +28,7 @@ class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible citation = [ "Gleixner, A., Hendel, G., Gamrath, G., Achterberg, T., Bastubbe, M., Berthold, T., Christophel, P. M., Jarck, K., Koch, T., Linderoth, J., Lubbecke, M., Mittelmann, H. D., Ozyurt, D., Ralphs, T. K., Salvagnin, D., and Shinano, Y. MIPLIB 2017: Data-Driven Compilation of the 6th Mixed-Integer Programming Library. Mathematical Programming Computation, 2021. https://doi.org/10.1007/s12532-020-00194-3.", ] - domain = "mixed integer programming" - format = "MPS" - - + def __init__( self, diff --git a/cpmpy/tools/dataset/mse.py b/cpmpy/tools/dataset/mse.py index 7403dea84..68752ca94 100644 --- a/cpmpy/tools/dataset/mse.py +++ b/cpmpy/tools/dataset/mse.py @@ -31,8 +31,6 @@ class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible name = "mse" description = "MaxSAT Evaluation competition benchmark instances." url = "https://maxsat-evaluations.github.io/" - domain = "maximum satisfiability" - def __init__( self, diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py index 21d3143c6..e4b431988 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -47,9 +47,6 @@ class NurseRosteringDataset(_Dataset): # torch.utils.data.Dataset compatible "Smet P. Constraint reformulation for nurse rostering problems, in: PATAT 2018 twelfth international conference on the practice and theory of automated timetabling, Vienna, August, 2018, p. 69-80.", "Rahimian, E., Akartunali, K., and Levine, J. 
A hybrid integer programming and variable neighbourhood search algorithm to solve nurse rostering problems. European Journal of Operational Research, 2017. 258(2): p. 411-423.", ] - domain = "scheduling" - format = "NRP text" - def __init__(self, root: str = ".", transform=None, target_transform=None, download:bool=False, sort_key=None): """ diff --git a/cpmpy/tools/dataset/opb.py b/cpmpy/tools/dataset/opb.py index e4e05df3c..16390e2df 100644 --- a/cpmpy/tools/dataset/opb.py +++ b/cpmpy/tools/dataset/opb.py @@ -30,8 +30,6 @@ class OPBDataset(_Dataset): name = "opb" description = "Pseudo-Boolean Competition benchmark instances." url = "https://www.cril.univ-artois.fr/PB25/" - domain = "pseudo-boolean optimization" - format = "OPB" def __init__( self, diff --git a/cpmpy/tools/dataset/psplib.py b/cpmpy/tools/dataset/psplib.py index 61f9e5bf5..52854504f 100644 --- a/cpmpy/tools/dataset/psplib.py +++ b/cpmpy/tools/dataset/psplib.py @@ -21,9 +21,6 @@ class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible name = "psplib" description = "Project Scheduling Problem Library (RCPSP) benchmark instances." url = "https://www.om-db.wi.tum.de/psplib/main.html" - domain = "scheduling" - format = "PSPLIB SM" - def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False): """ diff --git a/cpmpy/tools/dataset/xcsp3.py b/cpmpy/tools/dataset/xcsp3.py index 4bb200bd8..e8ff5c68e 100644 --- a/cpmpy/tools/dataset/xcsp3.py +++ b/cpmpy/tools/dataset/xcsp3.py @@ -30,13 +30,6 @@ class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible name = "xcsp3" description = "XCSP3 competition benchmark instances for constraint satisfaction and optimization." 
url = "https://xcsp.org/instances/" - domain = "constraint programming" - format = "XCSP3" - - - - - def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False): From ece1fc703d0cfa0b9b5a6ea0b6e526f31fce0484 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 17 Feb 2026 11:23:17 +0100 Subject: [PATCH 114/152] consistent reader/parser/loader naming --- cpmpy/tools/dataset/__init__.py | 4 +- cpmpy/tools/dataset/_base.py | 52 +++++++- cpmpy/tools/dataset/jsplib.py | 19 ++- cpmpy/tools/dataset/miplib.py | 30 ++++- cpmpy/tools/dataset/mse.py | 19 ++- cpmpy/tools/dataset/nurserostering.py | 19 ++- cpmpy/tools/dataset/opb.py | 19 ++- cpmpy/tools/dataset/psplib.py | 25 +++- cpmpy/tools/dataset/transforms.py | 181 ++++++++++++++++++-------- cpmpy/tools/dataset/xcsp3.py | 25 +++- cpmpy/tools/dimacs.py | 10 +- cpmpy/tools/io/__init__.py | 18 +-- cpmpy/tools/io/jsplib.py | 10 +- cpmpy/tools/io/nurserostering.py | 10 +- cpmpy/tools/io/opb.py | 7 +- cpmpy/tools/io/rcpsp.py | 10 +- cpmpy/tools/io/reader.py | 55 ++++---- cpmpy/tools/io/scip.py | 10 +- cpmpy/tools/io/wcnf.py | 10 +- cpmpy/tools/xcsp3/__init__.py | 2 +- cpmpy/tools/xcsp3/parser.py | 7 +- 21 files changed, 415 insertions(+), 127 deletions(-) diff --git a/cpmpy/tools/dataset/__init__.py b/cpmpy/tools/dataset/__init__.py index 26099b64a..cc4ca2857 100644 --- a/cpmpy/tools/dataset/__init__.py +++ b/cpmpy/tools/dataset/__init__.py @@ -6,4 +6,6 @@ from .xcsp3 import XCSP3Dataset from .opb import OPBDataset from .mse import MSEDataset -from .transforms import Compose, Open, Parse, Serialize, Translate, SaveToFile, Lambda, extract_format_metadata +from .transforms import Compose, Open, Load, Serialize, Translate, SaveToFile, Lambda, extract_format_metadata +# Backward compatibility alias +Parse = Load diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/dataset/_base.py index 7def2eefe..e17f9939c 100644 --- a/cpmpy/tools/dataset/_base.py +++ 
b/cpmpy/tools/dataset/_base.py @@ -214,6 +214,26 @@ def __init__( def reader(file_path, open=open) -> cp.Model: """ Reader for the dataset. + Parses a file path directly into a CPMpy model. + For backward compatibility. Consider using read() + load() instead. + """ + pass + + @staticmethod + @abstractmethod + def loader(content: str) -> cp.Model: + """ + Loader for the dataset. + Loads a CPMpy model from raw file content string. + + This is the "loading" step: turning raw contents into a CPMpy model. + The content should be the raw text content of the file (already decompressed). + + Arguments: + content (str): Raw file content string to load into a model. + + Returns: + cp.Model: The loaded CPMpy model. """ pass @@ -252,7 +272,8 @@ def collect_instance_metadata(self, file: pathlib.Path) -> dict: """ return {} - def open(self, instance) -> io.TextIOBase: + @classmethod + def open(cls, instance) -> io.TextIOBase: """ How an instance file from the dataset should be opened. Especially usefull when files come compressed and won't work with @@ -260,6 +281,35 @@ def open(self, instance) -> io.TextIOBase: """ return open(instance, "r") + def read(self, instance) -> str: + """ + Read raw file contents from an instance file. + Handles decompression automatically via dataset.open(). + + This is the "reading" step: decompressing + reading raw file contents. + """ + with self.open(instance) as f: + return f.read() + + def load(self, instance) -> cp.Model: + """ + Load a CPMpy model from an instance file. + + This is the "loading" step: uses `read()` to handle reading (decompressing + + reading raw contents) and then turns raw contents into a CPMpy model via `loader()`. + Loading always handles reading internally by calling `read()`. + + Arguments: + instance: File path to the instance file. + + Returns: + cp.Model: The loaded CPMpy model. 
+ """ + # Step 1: Reading - use read() to decompress and read raw file contents + content = self.read(instance) + # Step 2: Loading - turn raw contents into CPMpy model + return self.loader(content) + # ---------------------------------------------------------------------------- # # Public interface # diff --git a/cpmpy/tools/dataset/jsplib.py b/cpmpy/tools/dataset/jsplib.py index 3ce46daf3..d95aeddf1 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/dataset/jsplib.py @@ -64,8 +64,23 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl @staticmethod def reader(file_path, open=open): - from cpmpy.tools.io.jsplib import read_jsplib - return read_jsplib(file_path, open=open) + """ + Reader for JSPLib dataset. + Parses a file path directly into a CPMpy model. + For backward compatibility. Consider using read() + load() instead. + """ + from cpmpy.tools.io.jsplib import load_jsplib + return load_jsplib(file_path, open=open) + + @staticmethod + def loader(content: str): + """ + Loader for JSPLib dataset. + Loads a CPMpy model from raw JSPLib content string. + """ + from cpmpy.tools.io.jsplib import load_jsplib + # load_jsplib already supports raw strings + return load_jsplib(content) def category(self) -> dict: return {} # no categories diff --git a/cpmpy/tools/dataset/miplib.py b/cpmpy/tools/dataset/miplib.py index 333b6241f..40bc676ab 100644 --- a/cpmpy/tools/dataset/miplib.py +++ b/cpmpy/tools/dataset/miplib.py @@ -67,8 +67,34 @@ def __init__( @staticmethod def reader(file_path, open=open): - from cpmpy.tools.io.scip import read_scip - return read_scip(file_path, open=open) + """ + Reader for MIPLib dataset. + Parses a file path directly into a CPMpy model. + For backward compatibility. Consider using read() + load() instead. + """ + from cpmpy.tools.io.scip import load_scip + return load_scip(file_path, open=open) + + @staticmethod + def loader(content: str): + """ + Loader for MIPLib dataset. 
+ Loads a CPMpy model from raw MPS/LP content string. + Note: SCIP requires a file, so content is written to a temporary file. + """ + import tempfile + import os + from cpmpy.tools.io.scip import load_scip + + # SCIP requires a file path, so write content to temp file + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.mps') as tmp: + tmp.write(content) + tmp_path = tmp.name + + try: + return load_scip(tmp_path) + finally: + os.unlink(tmp_path) def category(self) -> dict: return { diff --git a/cpmpy/tools/dataset/mse.py b/cpmpy/tools/dataset/mse.py index 68752ca94..04f4f89f1 100644 --- a/cpmpy/tools/dataset/mse.py +++ b/cpmpy/tools/dataset/mse.py @@ -77,8 +77,23 @@ def __init__( @staticmethod def reader(file_path, open=open): - from cpmpy.tools.io.wcnf import read_wcnf - return read_wcnf(file_path, open=open) + """ + Reader for MSE dataset. + Parses a file path directly into a CPMpy model. + For backward compatibility. Consider using read() + load() instead. + """ + from cpmpy.tools.io.wcnf import load_wcnf + return load_wcnf(file_path, open=open) + + @staticmethod + def loader(content: str): + """ + Loader for MSE dataset. + Loads a CPMpy model from raw WCNF content string. + """ + from cpmpy.tools.io.wcnf import load_wcnf + # load_wcnf already supports raw strings + return load_wcnf(content) def category(self) -> dict: return { diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/dataset/nurserostering.py index e4b431988..015db79b3 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/dataset/nurserostering.py @@ -75,8 +75,23 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl @staticmethod def reader(file_path, open=open): - from cpmpy.tools.io.nurserostering import read_nurserostering - return read_nurserostering(file_path, open=open) + """ + Reader for Nurse Rostering dataset. + Parses a file path directly into a CPMpy model. + For backward compatibility. 
Consider using read() + load() instead. + """ + from cpmpy.tools.io.nurserostering import load_nurserostering + return load_nurserostering(file_path, open=open) + + @staticmethod + def loader(content: str): + """ + Loader for Nurse Rostering dataset. + Loads a CPMpy model from raw Nurse Rostering content string. + """ + from cpmpy.tools.io.nurserostering import load_nurserostering + # load_nurserostering already supports raw strings + return load_nurserostering(content) def category(self) -> dict: return {} # no categories diff --git a/cpmpy/tools/dataset/opb.py b/cpmpy/tools/dataset/opb.py index 16390e2df..46ecaa932 100644 --- a/cpmpy/tools/dataset/opb.py +++ b/cpmpy/tools/dataset/opb.py @@ -79,8 +79,23 @@ def __init__( @staticmethod def reader(file_path, open=open): - from cpmpy.tools.io.opb import read_opb - return read_opb(file_path, open=open) + """ + Reader for OPB dataset. + Parses a file path directly into a CPMpy model. + For backward compatibility. Consider using read() + load() instead. + """ + from cpmpy.tools.io.opb import load_opb + return load_opb(file_path, open=open) + + @staticmethod + def loader(content: str): + """ + Loader for OPB dataset. + Loads a CPMpy model from raw OPB content string. + """ + from cpmpy.tools.io.opb import load_opb + # load_opb already supports raw strings + return load_opb(content) def category(self) -> dict: return { diff --git a/cpmpy/tools/dataset/psplib.py b/cpmpy/tools/dataset/psplib.py index 52854504f..cd9c4e1c1 100644 --- a/cpmpy/tools/dataset/psplib.py +++ b/cpmpy/tools/dataset/psplib.py @@ -22,6 +22,7 @@ class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible description = "Project Scheduling Problem Library (RCPSP) benchmark instances." url = "https://www.om-db.wi.tum.de/psplib/main.html" + def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False): """ Constructor for a dataset object for PSPlib. 
@@ -64,8 +65,23 @@ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", @staticmethod def reader(file_path, open=open): - from cpmpy.tools.io.rcpsp import read_rcpsp - return read_rcpsp(file_path, open=open) + """ + Reader for PSPLib dataset. + Parses a file path directly into a CPMpy model. + For backward compatibility. Consider using read() + load() instead. + """ + from cpmpy.tools.io.rcpsp import load_rcpsp + return load_rcpsp(file_path, open=open) + + @staticmethod + def loader(content: str): + """ + Loader for PSPLib dataset. + Loads a CPMpy model from raw RCPSP content string. + """ + from cpmpy.tools.io.rcpsp import load_rcpsp + # load_rcpsp already supports raw strings + return load_rcpsp(content) def category(self) -> dict: return { @@ -104,10 +120,7 @@ def collect_instance_metadata(self, file) -> dict: elif line.startswith("- doubly constrained"): match = re.search(r':\s*(\d+)', line) if match: - result["num_doubly_constrained_resources"] = @staticmethod - def reader(file_path, open=open): - from cpmpy.tools.io.rcpsp import read_rcpsp - return read_rcpsp(file_path, open=open)int(match.group(1)) + result["num_doubly_constrained_resources"] = int(match.group(1)) elif line.startswith("PROJECT INFORMATION"): in_project_info = True elif in_project_info and not line.startswith("*") and not line.startswith("pronr"): diff --git a/cpmpy/tools/dataset/transforms.py b/cpmpy/tools/dataset/transforms.py index b7205f5dc..bfcff8ba2 100644 --- a/cpmpy/tools/dataset/transforms.py +++ b/cpmpy/tools/dataset/transforms.py @@ -14,7 +14,7 @@ Compose Open - Parse + Load Serialize Translate SaveToFile @@ -22,14 +22,14 @@ Example usage:: - from cpmpy.tools.dataset import MSEDataset, Compose, Parse, Serialize - from cpmpy.tools.io.wcnf import read_wcnf + from cpmpy.tools.dataset import MSEDataset, Compose, Load, Serialize + from cpmpy.tools.io.wcnf import load_wcnf dataset = MSEDataset(root=".", year=2024, track="exact-weighted") - # Chain: parse WCNF 
files, then serialize to DIMACS + # Chain: load WCNF files, then serialize to DIMACS transform = Compose([ - Parse(read_wcnf, open=dataset.open), + Load(load_wcnf, open=dataset.open), Serialize("dimacs"), ]) dataset.transform = transform @@ -128,7 +128,7 @@ def extract_format_metadata(content, format_name): def _enrich_from_model(model, metadata): """Add decision variable and objective info from a CPMpy Model to metadata. - This is called by transforms that produce CPMpy models (Parse, Translate) + This is called by transforms that produce CPMpy models (Load, Translate) via their ``enrich_metadata`` method. It adds: - ``decision_variables``: list of dicts with name, type, lb, ub for each variable @@ -167,7 +167,7 @@ class Compose: Transforms that define ``enrich_metadata(data, metadata)`` can contribute additional fields to the metadata dictionary. Each sub-transform's ``enrich_metadata`` receives the intermediate result *it* produced, so a - :class:`Parse` inside ``Compose([Parse(...), Serialize(...)])`` sees the + :class:`Load` inside ``Compose([Load(...), Serialize(...)])`` sees the CPMpy model, not the final serialized string. Arguments: @@ -176,7 +176,7 @@ class Compose: Example:: >>> transform = Compose([ - ... Parse(read_wcnf, open=dataset.open), + ... Load(load_wcnf, open=dataset.open), ... Serialize("dimacs"), ... ]) >>> dataset = MSEDataset(transform=transform) @@ -240,47 +240,70 @@ def __repr__(self): return f"{self.__class__.__name__}(open={self._open})" -class Parse: +class Load: """ - Transform that parses a file path into a CPMpy model using a reader function. + Transform that loads a file path into a CPMpy model. + + Loading always handles reading internally. This transform combines reading + (decompressing + reading raw contents) and loading (turning raw contents + into a CPMpy model) into a single step. Implements ``enrich_metadata`` to add model verification information (decision variables, objective) to the metadata dictionary. 
This is called automatically by the dataset's ``__getitem__``. Arguments: - reader (callable): A reader function such as ``read_wcnf``, ``read_opb``, - ``read_scip``, ``read_dimacs``, etc. - open (callable, optional): A callable to open files, passed to the reader - as the ``open`` keyword argument. If None, the reader uses its default. - **kwargs: Additional keyword arguments passed to the reader. + loader (callable): A loader function that takes raw content string and + returns a CPMpy model. Can be a dataset's ``loader`` method or a + loader function that supports raw strings (e.g., ``load_wcnf``, + ``load_opb``, ``load_xcsp3``, etc.). + open (callable, optional): A callable to open files for reading. + Typically ``dataset.open``. Defaults to Python's built-in ``open``. + **kwargs: Additional keyword arguments passed to the loader (if supported). Example:: - >>> from cpmpy.tools.io.wcnf import read_wcnf - >>> dataset = MSEDataset(transform=Parse(read_wcnf, open=dataset.open)) + >>> # Using dataset's loader method + >>> dataset = MSEDataset(transform=Load(dataset.loader, open=dataset.open)) + >>> model, metadata = dataset[0] + + >>> # Using a loader function that supports raw strings + >>> from cpmpy.tools.io.wcnf import load_wcnf + >>> dataset = MSEDataset(transform=Load(load_wcnf, open=dataset.open)) >>> model, metadata = dataset[0] >>> metadata['decision_variables'] # list of variable descriptors >>> metadata['objective'] # objective expression string (if any) """ - def __init__(self, reader, open=None, **kwargs): - self.reader = reader - self._open = open + def __init__(self, loader, open=None, **kwargs): + self.loader = loader + self._open = open if open is not None else _builtins_open self.kwargs = kwargs def __call__(self, file_path): - if self._open is not None: - return self.reader(file_path, open=self._open, **self.kwargs) - return self.reader(file_path, **self.kwargs) + # Step 1: Reading - decompress and read raw file contents + with 
self._open(file_path) as f: + content = f.read() + + # Step 2: Loading - turn raw contents into CPMpy model + # Prepare kwargs, ensuring 'open' doesn't conflict + kwargs = {k: v for k, v in self.kwargs.items() if k != 'open'} + + # Handle both regular functions and classmethods/staticmethods + if hasattr(self.loader, '__self__') or isinstance(self.loader, classmethod): + # It's a bound method or classmethod, call it directly + return self.loader(content, **kwargs) + else: + # It's a regular function, call it normally + return self.loader(content, **kwargs) def enrich_metadata(self, data, metadata): """Add model verification info if data is a CPMpy Model.""" return _enrich_from_model(data, metadata) def __repr__(self): - reader_name = getattr(self.reader, '__name__', repr(self.reader)) - return f"{self.__class__.__name__}(reader={reader_name})" + loader_name = getattr(self.loader, '__name__', repr(self.loader)) + return f"{self.__class__.__name__}(loader={loader_name})" class Serialize: @@ -288,71 +311,119 @@ class Serialize: Transform that serializes a CPMpy model to a string in a given format. Arguments: - format (str): Output format name (e.g., ``"dimacs"``, ``"mps"``, ``"opb"``). - Must be a format supported by :func:`cpmpy.tools.io.writer.write`. + writer (callable or str): Either a writer function (e.g., ``write_dimacs``, ``write_opb``) + or a format name string (e.g., ``"dimacs"``, ``"mps"``, ``"opb"``) that will be resolved + to the appropriate writer function. If a string, must be a format supported by + :func:`cpmpy.tools.io.writer.write`. **kwargs: Additional keyword arguments passed to the writer (e.g., ``header``, ``verbose``). Example:: + >>> # Using format name string >>> transform = Compose([ - ... Parse(read_wcnf, open=dataset.open), + ... Load(load_wcnf, open=dataset.open), ... Serialize("dimacs"), ... ]) + + >>> # Using writer function directly + >>> from cpmpy.tools.dimacs import write_dimacs + >>> transform = Compose([ + ... 
Load(load_wcnf, open=dataset.open), + ... Serialize(write_dimacs), + ... ]) """ - def __init__(self, format, **kwargs): - self.format = format + def __init__(self, writer, **kwargs): + self.writer = writer self.kwargs = kwargs def __call__(self, model): - from cpmpy.tools.io.writer import write - return write(model, format=self.format, file_path=None, **self.kwargs) + # Determine writer function + if callable(self.writer): + # writer is a callable function + return self.writer(model, fname=None, **self.kwargs) + else: + # writer is a format name string, use unified write function + from cpmpy.tools.io.writer import write + return write(model, format=self.writer, file_path=None, **self.kwargs) def __repr__(self): - return f"{self.__class__.__name__}(format='{self.format}')" + if callable(self.writer): + writer_name = getattr(self.writer, '__name__', repr(self.writer)) + return f"{self.__class__.__name__}(writer={writer_name})" + else: + return f"{self.__class__.__name__}(writer='{self.writer}')" class Translate: """ Transform that translates a file from one format to another. - Combines reading (parsing) and writing (serialization) in one step. + Combines reading (decompressing + reading raw contents), loading (turning raw + contents into a CPMpy model), and writing (serializing the model) in one step. Implements ``enrich_metadata`` to add model verification information from the intermediate CPMpy model to the metadata dictionary. Arguments: - reader (callable): A reader function (e.g., ``read_wcnf``, ``read_opb``). - format (str): Output format name (e.g., ``"dimacs"``, ``"mps"``). - open (callable, optional): A callable to open compressed files, - passed to the reader. + loader (callable): A loader function that takes raw content string and + returns a CPMpy model. Can be a dataset's ``loader`` method or a + loader function that supports raw strings (e.g., ``load_wcnf``, + ``read_opb``, ``read_xcsp3``, etc.). 
+ writer (callable or str): Either a writer function (e.g., ``write_dimacs``, ``write_opb``) + or a format name string (e.g., ``"dimacs"``, ``"mps"``) that will be resolved + to the appropriate writer function. + open (callable, optional): A callable to open compressed files for reading. + Typically ``dataset.open``. Defaults to Python's built-in ``open``. **kwargs: Additional keyword arguments passed to the writer. Example:: - >>> transform = Translate(read_wcnf, "dimacs", open=dataset.open) + >>> # Using format name string + >>> transform = Translate(dataset.loader, "dimacs", open=dataset.open) + >>> dataset = MSEDataset(transform=transform) + >>> dimacs_string, metadata = dataset[0] + + >>> # Using writer function directly + >>> from cpmpy.tools.dimacs import write_dimacs + >>> transform = Translate(dataset.loader, write_dimacs, open=dataset.open) >>> dataset = MSEDataset(transform=transform) >>> dimacs_string, metadata = dataset[0] >>> metadata['decision_variables'] # from the intermediate model """ - def __init__(self, reader, format, open=None, **kwargs): - self.reader = reader - self.format = format - self._open = open + def __init__(self, loader, writer, open=None, **kwargs): + self.loader = loader + self.writer = writer + self._open = open if open is not None else _builtins_open self.kwargs = kwargs self._last_model = None def __call__(self, file_path): - from cpmpy.tools.io.writer import write - - if self._open is not None: - model = self.reader(file_path, open=self._open) + # Step 1: Reading - decompress and read raw file contents + with self._open(file_path) as f: + content = f.read() + + # Step 2: Loading - turn raw contents into CPMpy model + loader_kwargs = {k: v for k, v in self.kwargs.items() if k != 'open'} + + # Handle both regular functions and classmethods/staticmethods + if hasattr(self.loader, '__self__') or isinstance(self.loader, classmethod): + model = self.loader(content, **loader_kwargs) else: - model = self.reader(file_path) + model = 
self.loader(content, **loader_kwargs) self._last_model = model - return write(model, format=self.format, file_path=None, **self.kwargs) + + # Step 3: Writing - serialize model to string + writer_kwargs = {k: v for k, v in self.kwargs.items() if k != 'open'} + if callable(self.writer): + # writer is a callable function + return self.writer(model, fname=None, **writer_kwargs) + else: + # writer is a format name string, use unified write function + from cpmpy.tools.io.writer import write + return write(model, format=self.writer, file_path=None, **writer_kwargs) def enrich_metadata(self, data, metadata): """Add model verification info from the intermediate model.""" @@ -361,8 +432,12 @@ def enrich_metadata(self, data, metadata): return metadata def __repr__(self): - reader_name = getattr(self.reader, '__name__', repr(self.reader)) - return f"{self.__class__.__name__}(reader={reader_name}, format='{self.format}')" + loader_name = getattr(self.loader, '__name__', repr(self.loader)) + if callable(self.writer): + writer_name = getattr(self.writer, '__name__', repr(self.writer)) + return f"{self.__class__.__name__}(loader={loader_name}, writer={writer_name})" + else: + return f"{self.__class__.__name__}(loader={loader_name}, writer='{self.writer}')" class SaveToFile: @@ -389,7 +464,7 @@ class SaveToFile: Example:: >>> transform = Compose([ - ... Translate(read_wcnf, "dimacs", open=dataset.open), + ... Translate(load_wcnf, "dimacs", open=dataset.open), ... SaveToFile("output/", extension=".cnf", write_metadata=True), ... ]) """ @@ -509,7 +584,7 @@ class Lambda: Example:: >>> transform = Compose([ - ... Parse(read_wcnf, open=dataset.open), + ... Load(load_wcnf, open=dataset.open), ... Lambda(lambda m: len(m.constraints), name="count_constraints"), ... 
]) """ diff --git a/cpmpy/tools/dataset/xcsp3.py b/cpmpy/tools/dataset/xcsp3.py index e8ff5c68e..1620fb28d 100644 --- a/cpmpy/tools/dataset/xcsp3.py +++ b/cpmpy/tools/dataset/xcsp3.py @@ -56,9 +56,24 @@ def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transf @classmethod - def reader(file_path, open=open): - from cpmpy.tools.xcsp3.parser import read_xcsp3 - return read_xcsp3(file_path, open=open) + def reader(cls, file_path, open=open): + """ + Reader for XCSP3 dataset. + Parses a file path directly into a CPMpy model. + For backward compatibility. Consider using read() + load() instead. + """ + from cpmpy.tools.xcsp3.parser import load_xcsp3 + return load_xcsp3(file_path, open=open) + + @classmethod + def loader(cls, content: str): + """ + Loader for XCSP3 dataset. + Loads a CPMpy model from raw XCSP3 content string. + """ + from cpmpy.tools.xcsp3.parser import load_xcsp3 + # load_xcsp3 already supports raw strings + return load_xcsp3(content) def category(self) -> dict: return { @@ -143,7 +158,9 @@ def download(self): # Clean up the zip file target_download_path.unlink() - def open(self, instance: os.PathLike) -> io.TextIOBase: + + @classmethod + def open(cls, instance: os.PathLike) -> io.TextIOBase: return lzma.open(instance, mode='rt', encoding='utf-8') if str(instance).endswith(".lzma") else open(instance) diff --git a/cpmpy/tools/dimacs.py b/cpmpy/tools/dimacs.py index 42197cc22..534c5d134 100644 --- a/cpmpy/tools/dimacs.py +++ b/cpmpy/tools/dimacs.py @@ -79,9 +79,9 @@ def write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMAC return out -def read_dimacs(fname): +def load_dimacs(fname): """ - Read a CPMpy model from a DIMACS formatted file strictly following the specification: + Load a CPMpy model from a DIMACS formatted file strictly following the specification: https://web.archive.org/web/20190325181937/https://www.satcompetition.org/2009/format-benchmarks2009.html .. 
note:: @@ -129,6 +129,12 @@ def read_dimacs(fname): return m +# Backward compatibility alias +read_dimacs = load_dimacs + +# Backward compatibility alias +read_dimacs = load_dimacs + diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index 19ae87d65..5e396ac00 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -1,23 +1,23 @@ """ IO tools for CPMpy. -This module provides tools to read and write models in various formats. -Use the generic `read(..., format="...")` and `write(..., format="...")` functions to read and write +This module provides tools to load and write models in various formats. +Use the generic `load(..., format="...")` and `write(..., format="...")` functions to load and write models in one of the supported formats. Some formats can be auto-detected from the file extension, so only a file path is required as argument. """ from .writer import write, write_formats -from .reader import read, read_formats +from .reader import load, read, read_formats # read is alias for backward compatibility from .utils import get_extension, get_format # Problem datasets -from .jsplib import read_jsplib -from .nurserostering import read_nurserostering -from .rcpsp import read_rcpsp +from .jsplib import load_jsplib, read_jsplib # read_jsplib is alias for backward compatibility +from .nurserostering import load_nurserostering, read_nurserostering # read_nurserostering is alias +from .rcpsp import load_rcpsp, read_rcpsp # read_rcpsp is alias # Model datasets -from .opb import read_opb, write_opb -from .scip import read_scip, write_scip -from .wcnf import read_wcnf \ No newline at end of file +from .opb import load_opb, read_opb, write_opb # read_opb is alias +from .scip import load_scip, read_scip, write_scip # read_scip is alias +from .wcnf import load_wcnf, read_wcnf # read_wcnf is alias \ No newline at end of file diff --git a/cpmpy/tools/io/jsplib.py b/cpmpy/tools/io/jsplib.py index 7f1c13c1a..b7e029976 100644 --- 
a/cpmpy/tools/io/jsplib.py +++ b/cpmpy/tools/io/jsplib.py @@ -28,9 +28,9 @@ _std_open = open -def read_jsplib(jsp: Union[str, os.PathLike], open=open) -> cp.Model: +def load_jsplib(jsp: Union[str, os.PathLike], open=open) -> cp.Model: """ - Parser for JSPLib format. Reads in an instance and returns its matching CPMpy model. + Loader for JSPLib format. Loads an instance and returns its matching CPMpy model. Arguments: jsp (str or os.PathLike): @@ -148,5 +148,11 @@ def main(): else: print("No solution found.") +# Backward compatibility alias +read_jsplib = load_jsplib + +# Backward compatibility alias +read_jsplib = load_jsplib + if __name__ == "__main__": main() \ No newline at end of file diff --git a/cpmpy/tools/io/nurserostering.py b/cpmpy/tools/io/nurserostering.py index 89e292085..976a76ad3 100644 --- a/cpmpy/tools/io/nurserostering.py +++ b/cpmpy/tools/io/nurserostering.py @@ -32,9 +32,9 @@ _std_open = open -def read_nurserostering(instance: Union[str, os.PathLike], open=open) -> cp.Model: +def load_nurserostering(instance: Union[str, os.PathLike], open=open) -> cp.Model: """ - Parser for Nurse Rostering format. Reads in an instance and returns its matching CPMpy model. + Loader for Nurse Rostering format. Loads an instance and returns its matching CPMpy model. 
Arguments: instance (str or os.PathLike): @@ -69,6 +69,9 @@ def read_nurserostering(instance: Union[str, os.PathLike], open=open) -> cp.Mode if isinstance(instance, str) and not os.path.exists(instance) and os.path.exists(fname): os.unlink(fname) +# Backward compatibility alias +read_nurserostering = load_nurserostering + def main(): parser = argparse.ArgumentParser(description="Parse and solve a Nurse Rostering model using CPMpy") @@ -106,6 +109,9 @@ def main(): else: print("No solution found.") +# Backward compatibility alias +read_nurserostering = load_nurserostering + if __name__ == "__main__": main() diff --git a/cpmpy/tools/io/opb.py b/cpmpy/tools/io/opb.py index af2ad7c74..c0e34c152 100644 --- a/cpmpy/tools/io/opb.py +++ b/cpmpy/tools/io/opb.py @@ -120,9 +120,9 @@ def _parse_constraint(line, vars): ) _std_open = open -def read_opb(opb: Union[str, os.PathLike], open=open) -> cp.Model: +def load_opb(opb: Union[str, os.PathLike], open=open) -> cp.Model: """ - Parser for OPB (Pseudo-Boolean) format. Reads in an instance and returns its matching CPMpy model. + Loader for OPB (Pseudo-Boolean) format. Loads an instance and returns its matching CPMpy model. Based on PyPBLib's example parser: https://hardlog.udl.cat/static/doc/pypblib/html/library/index.html#example-from-opb-to-cnf-file @@ -484,5 +484,8 @@ def main(): else: print("No solution found.") +# Backward compatibility alias +read_opb = load_opb + if __name__ == "__main__": main() diff --git a/cpmpy/tools/io/rcpsp.py b/cpmpy/tools/io/rcpsp.py index 84aa29afa..18b2e39f5 100644 --- a/cpmpy/tools/io/rcpsp.py +++ b/cpmpy/tools/io/rcpsp.py @@ -27,9 +27,9 @@ _std_open = open -def read_rcpsp(rcpsp: Union[str, os.PathLike], open=open) -> cp.Model: +def load_rcpsp(rcpsp: Union[str, os.PathLike], open=open) -> cp.Model: """ - Parser for PSPLIB RCPSP format. Reads in an instance and returns its matching CPMpy model. + Loader for PSPLIB RCPSP format. Loads an instance and returns its matching CPMpy model. 
Arguments: rcpsp (str or os.PathLike): @@ -171,5 +171,11 @@ def main(): else: print("No solution found.") +# Backward compatibility alias +read_rcpsp = load_rcpsp + +# Backward compatibility alias +read_rcpsp = load_rcpsp + if __name__ == "__main__": main() \ No newline at end of file diff --git a/cpmpy/tools/io/reader.py b/cpmpy/tools/io/reader.py index a3203a7cb..d25df69a5 100644 --- a/cpmpy/tools/io/reader.py +++ b/cpmpy/tools/io/reader.py @@ -1,5 +1,5 @@ """ -CPMpy tools for reading models from files. +CPMpy tools for loading models from files. ================= List of functions @@ -8,30 +8,30 @@ .. autosummary:: :nosignatures: - read + load read_formats """ from typing import Callable, List, Optional import cpmpy as cp -from cpmpy.tools.dimacs import read_dimacs -from cpmpy.tools.io.scip import read_scip -from cpmpy.tools.io.wcnf import read_wcnf -from cpmpy.tools.io.opb import read_opb +from cpmpy.tools.dimacs import load_dimacs +from cpmpy.tools.io.scip import load_scip +from cpmpy.tools.io.wcnf import load_wcnf +from cpmpy.tools.io.opb import load_opb from cpmpy.tools.io.utils import get_format -# mapping format names to appropriate reader functions +# mapping format names to appropriate loader functions _reader_map = { - "mps": read_scip, - "lp": read_scip, - "cip": read_scip, - "fzn": read_scip, - "gms": read_scip, - "pip": read_scip, - "dimacs": read_dimacs, - "opb": read_opb, - "wcnf": read_wcnf, + "mps": load_scip, + "lp": load_scip, + "cip": load_scip, + "fzn": load_scip, + "gms": load_scip, + "pip": load_scip, + "dimacs": load_dimacs, + "opb": load_opb, + "wcnf": load_wcnf, } @@ -56,16 +56,16 @@ def _get_reader(format: str) -> Callable[[str], cp.Model]: def read_formats() -> List[str]: """ - List of supported read formats. + List of supported load formats. - Each can be used as the `format` argument to the `read` function. + Each can be used as the `format` argument to the `load` function. E.g.: .. 
code-block:: python - from cpmpy.tools.io import read - model = read(file_path, format="mps") - model = read(file_path, format="lp") + from cpmpy.tools.io import load + model = load(file_path, format="mps") + model = load(file_path, format="lp") """ return list(_reader_map.keys()) @@ -98,13 +98,13 @@ def _derive_format(file_path: str) -> str: raise ValueError(f"No file format provided and could not derive format from file path: {file_path}") -def read(file_path: str, format: Optional[str] = None) -> cp.Model: +def load(file_path: str, format: Optional[str] = None) -> cp.Model: """ - Read a model from a file. + Load a model from a file. Arguments: - file_path (str): The path to the file to read. - format (Optional[str]): The format of the file to read. If None, the format will be derived from the file path. + file_path (str): The path to the file to load. + format (Optional[str]): The format of the file to load. If None, the format will be derived from the file path. Raises: ValueError: If the format is not supported. @@ -117,4 +117,7 @@ def read(file_path: str, format: Optional[str] = None) -> cp.Model: format = _derive_format(file_path) reader = _get_reader(format) - return reader(file_path) \ No newline at end of file + return reader(file_path) + +# Backward compatibility alias +read = load \ No newline at end of file diff --git a/cpmpy/tools/io/scip.py b/cpmpy/tools/io/scip.py index 3f9d7086b..ccb22ab37 100644 --- a/cpmpy/tools/io/scip.py +++ b/cpmpy/tools/io/scip.py @@ -65,9 +65,9 @@ def _ignore_variable_name_check(): _std_open = open -def read_scip(fname: Union[str, os.PathLike], open=open, assume_integer:bool=False) -> cp.Model: +def load_scip(fname: Union[str, os.PathLike], open=open, assume_integer:bool=False) -> cp.Model: """ - Read a SCIP-compatible model from a file and return a CPMpy model. + Load a SCIP-compatible model from a file and return a CPMpy model. Arguments: fname: The path to the SCIP-compatible file to read. 
@@ -632,5 +632,11 @@ def main(): else: print("No solution found.") +# Backward compatibility alias +read_scip = load_scip + +# Backward compatibility alias +read_scip = load_scip + if __name__ == "__main__": main() diff --git a/cpmpy/tools/io/wcnf.py b/cpmpy/tools/io/wcnf.py index 5cea77608..46f140388 100644 --- a/cpmpy/tools/io/wcnf.py +++ b/cpmpy/tools/io/wcnf.py @@ -39,9 +39,9 @@ def _get_var(i, vars_dict): return vars_dict[i] _std_open = open -def read_wcnf(wcnf: Union[str, os.PathLike], open=open) -> cp.Model: +def load_wcnf(wcnf: Union[str, os.PathLike], open=open) -> cp.Model: """ - Parser for WCNF format. Reads in an instance and returns its matching CPMpy model. + Loader for WCNF format. Loads an instance and returns its matching CPMpy model. Arguments: wcnf (str or os.PathLike): @@ -96,6 +96,9 @@ def read_wcnf(wcnf: Union[str, os.PathLike], open=open) -> cp.Model: return model +# Backward compatibility alias +read_wcnf = load_wcnf + def main(): parser = argparse.ArgumentParser(description="Parse and solve a WCNF model using CPMpy") parser.add_argument("model", help="Path to a WCNF file (or raw WCNF string if --string is given)") @@ -132,5 +135,8 @@ def main(): else: print("No solution found.") +# Backward compatibility alias +read_wcnf = load_wcnf + if __name__ == "__main__": main() \ No newline at end of file diff --git a/cpmpy/tools/xcsp3/__init__.py b/cpmpy/tools/xcsp3/__init__.py index 9572943d8..b968d8657 100644 --- a/cpmpy/tools/xcsp3/__init__.py +++ b/cpmpy/tools/xcsp3/__init__.py @@ -24,4 +24,4 @@ from .dataset import XCSP3Dataset # for easier importing -from .parser import read_xcsp3 \ No newline at end of file +from .parser import load_xcsp3, read_xcsp3 # read_xcsp3 is alias for backward compatibility \ No newline at end of file diff --git a/cpmpy/tools/xcsp3/parser.py b/cpmpy/tools/xcsp3/parser.py index 761ef7caa..d27677cf0 100644 --- a/cpmpy/tools/xcsp3/parser.py +++ b/cpmpy/tools/xcsp3/parser.py @@ -75,9 +75,9 @@ def _load_xcsp3(parser: 
"ParserXCSP3") -> cp.Model: return model _std_open = open -def read_xcsp3(xcsp3: os.PathLike, open=open) -> cp.Model: +def load_xcsp3(xcsp3: os.PathLike, open=open) -> cp.Model: """ - Reads in an XCSP3 instance (.xml or .xml.lzma) and returns its matching CPMpy model. + Loads an XCSP3 instance (.xml or .xml.lzma) and returns its matching CPMpy model. Arguments: xcsp3 (str or os.PathLike): @@ -141,6 +141,9 @@ def main(): else: print("No solution found.") +# Backward compatibility alias +read_xcsp3 = load_xcsp3 + if __name__ == "__main__": main() \ No newline at end of file From 048ad1b071f60f0d10b726c0746362c7b90a8e9e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Tue, 17 Feb 2026 11:23:33 +0100 Subject: [PATCH 115/152] writer auto format detection --- cpmpy/tools/io/writer.py | 52 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/cpmpy/tools/io/writer.py b/cpmpy/tools/io/writer.py index 59c6029de..d6e18c441 100644 --- a/cpmpy/tools/io/writer.py +++ b/cpmpy/tools/io/writer.py @@ -24,6 +24,7 @@ from cpmpy.tools.dimacs import write_dimacs from cpmpy.tools.io.scip import write_scip from cpmpy.tools.io.opb import write_opb +from cpmpy.tools.io.utils import get_format # mapping format names to appropriate writer functions _writer_map = { @@ -110,8 +111,8 @@ def write_formats() -> List[str]: .. 
code-block:: python from cpmpy.tools.io import write, write_formats, get_extension - write(model, format=write_formats()[0]) - write(model, format=write_formats()[1], file_path=f"model.{get_extension(write_formats()[1])}") + write(model, format=write_formats()[0]) # Returns string + write(model, f"model.{get_extension(write_formats()[1])}") # Writes to file, format auto-detected """ return list(_writer_map.keys()) @@ -126,19 +127,62 @@ def _create_header(format: str) -> str: header += "-"*100 + "\n" return header -def write(model: cp.Model, format: str, file_path: Optional[str] = None, verbose: bool = False, header: Optional[str] = None, **kwargs) -> str: +def _derive_format(file_path: str) -> str: + """ + Derive the format of a file from its path. + + Arguments: + file_path (str): The path to the file to derive the format from. + + Raises: + ValueError: If the format could not be derived from the file path. + + Returns: + The name of the format. + + Example: + >>> _derive_format("output.mps") + "mps" + >>> _derive_format("output.lp.xz") + "lp" + """ + + # Iterate over the file path extensions in reverse order + for ext in file_path.split(".")[::-1]: + try: + return get_format(ext) + except (ValueError, KeyError): + continue + + raise ValueError(f"No file format provided and could not derive format from file path: {file_path}") + +def write(model: cp.Model, file_path: Optional[str] = None, format: Optional[str] = None, verbose: bool = False, header: Optional[str] = None, **kwargs) -> str: """ Write a model to a file. Arguments: model (cp.Model): The model to write. - format (str): The format to write the model in. file_path (Optional[str]): The path to the file to write the model to. If None, only a string containing the model will be returned. + format (Optional[str]): The format to write the model in. If None and file_path is provided, the format will be derived from the file path extension. verbose (bool): Whether to print verbose output. 
header (Optional[str]): The header to put at the top of the file. If None, a default header will be created. Pass an empty string to skip adding a header. **kwargs: Additional arguments to pass to the writer. + + Raises: + ValueError: If the format is not supported or could not be derived from the file path. + + Example: + >>> write(model, "output.opb") # Format auto-detected from .opb + >>> write(model, "output.txt", format="opb") # Format explicitly specified + >>> write(model, format="opb") # Returns string, format must be specified """ + # Derive format from file_path if not provided + if format is None: + if file_path is None: + raise ValueError("Either 'format' or 'file_path' must be provided") + format = _derive_format(file_path) + writer = _get_writer(format) kwargs["verbose"] = verbose From 85f2f98c33ec99f2d0649a1e10662cd24aea882f Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sat, 28 Feb 2026 13:46:41 +0100 Subject: [PATCH 116/152] Metadata collection --- cpmpy/tools/{dataset => datasets}/_base.py | 294 +++++++++++++++------ 1 file changed, 215 insertions(+), 79 deletions(-) rename cpmpy/tools/{dataset => datasets}/_base.py (73%) diff --git a/cpmpy/tools/dataset/_base.py b/cpmpy/tools/datasets/_base.py similarity index 73% rename from cpmpy/tools/dataset/_base.py rename to cpmpy/tools/datasets/_base.py index e17f9939c..fa2e8a298 100644 --- a/cpmpy/tools/dataset/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -13,10 +13,11 @@ import io import tempfile import warnings -from typing import Any, Optional, Tuple, List +from typing import Any, Optional, Tuple, List, Union from urllib.error import URLError from urllib.request import HTTPError, Request, urlopen -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed +import multiprocessing # tqdm as an optional dependency, provides prettier progress bars try: @@ -137,6 +138,90 @@ def extract_model_features(model) -> dict: 
return _extract_model_features(model) +# Global context for process-based metadata collection workers +_metadata_worker_context = {} + + +def _init_metadata_worker(context_dict, collect_metadata_func, reader_func, open_func): + """Initialize worker process with dataset context.""" + global _metadata_worker_context + _metadata_worker_context = context_dict.copy() + _metadata_worker_context['collect_instance_metadata'] = collect_metadata_func + _metadata_worker_context['reader'] = reader_func + _metadata_worker_context['open_func'] = open_func + + +def _collect_one_metadata_worker(file_path_str): + """Worker function for process-based metadata collection.""" + global _metadata_worker_context + file_path = pathlib.Path(file_path_str) + dataset_dir = pathlib.Path(_metadata_worker_context['dataset_dir']) + meta_path = dataset_dir / (file_path.name + _metadata_worker_context['metadata_extension']) + + # Collect instance metadata using the provided function + collect_metadata = _metadata_worker_context['collect_instance_metadata'] + try: + instance_meta = collect_metadata(str(file_path)) + except Exception as e: + instance_meta = {"_metadata_error": str(e)} + + # Separate portable from format-specific fields + portable = portable_instance_metadata(instance_meta) + format_specific = { + k: v for k, v in instance_meta.items() + if k not in portable and not k.startswith("_") + } + + # Derive instance name + stem = file_path.stem + for ext in (".xml", ".wcnf", ".opb"): + if stem.endswith(ext): + stem = stem[:len(stem) - len(ext)] + break + + # Build structured sidecar + sidecar = { + "dataset": _metadata_worker_context['dataset_metadata'], + "instance_name": stem, + "source_file": str(file_path.relative_to(dataset_dir)), + "category": _metadata_worker_context['category'], + "instance_metadata": portable, + "format_metadata": format_specific, + } + + if "_metadata_error" in instance_meta: + sidecar["_metadata_error"] = instance_meta["_metadata_error"] + + # Preserve or compute 
model features + model_features = None + if meta_path.exists(): + try: + with open(meta_path, "r") as f: + existing = json.load(f) + if "model_features" in existing: + model_features = existing["model_features"] + except (json.JSONDecodeError, IOError): + pass + + if model_features is None: + reader = _metadata_worker_context['reader'] + open_func = _metadata_worker_context['open_func'] + if not callable(reader): + raise TypeError( + f"Cannot extract model features for {file_path}: " + "no dataset reader configured." + ) + model = reader(str(file_path), open=open_func) + model_features = extract_model_features(model) + + sidecar["model_features"] = model_features + + with open(meta_path, "w") as f: + json.dump(sidecar, f, indent=2) + + return str(file_path) + + class _Dataset(ABC): """ Abstract base class for PyTorch-style datasets of benchmarking instances. @@ -166,6 +251,7 @@ def __init__( transform=None, target_transform=None, download: bool = False, extension:str=".txt", + metadata_workers: int = 1, **kwargs ): """ @@ -177,6 +263,7 @@ def __init__( target_transform (callable, optional): Optional transform applied to the metadata dictionary. download (bool): If True, downloads the dataset if it does not exist locally (default=False). extension (str): Extension of the instance files. + metadata_workers (int): Number of parallel workers for metadata collection during download (default: 1). Raises: ValueError: If the dataset directory does not exist and `download=False`, @@ -188,7 +275,7 @@ def __init__( self.target_transform = target_transform self.extension = extension if not self.origins: - from cpmpy.tools.dataset.config import get_origins + from cpmpy.tools.datasets.config import get_origins self.origins = get_origins(self.name) if not self.dataset_dir.exists(): @@ -196,7 +283,7 @@ def __init__( raise ValueError("Dataset not found. 
Please set download=True to download the dataset.") else: self.download() - self._collect_all_metadata() + self._collect_all_metadata(workers=metadata_workers) files = self._list_instances() print(f"Finished downloading {len(files)} instances") @@ -273,25 +360,36 @@ def collect_instance_metadata(self, file: pathlib.Path) -> dict: return {} @classmethod - def open(cls, instance) -> io.TextIOBase: + def open(cls, instance: os.PathLike) -> io.TextIOBase: """ How an instance file from the dataset should be opened. Especially usefull when files come compressed and won't work with python standard library's 'open', e.g. '.xz', '.lzma'. + + Arguments: + instance (os.PathLike): File path to the instance file. + + Returns: + io.TextIOBase: The opened file. """ return open(instance, "r") - def read(self, instance) -> str: + def read(self, instance: os.PathLike) -> str: """ Read raw file contents from an instance file. Handles decompression automatically via dataset.open(). This is the "reading" step: decompressing + reading raw file contents. + + Arguments: + instance (os.PathLike): File path to the instance file. + Returns: + str: The raw file contents. """ with self.open(instance) as f: return f.read() - def load(self, instance) -> cp.Model: + def load(self, instance: Union[str, os.PathLike]) -> cp.Model: """ Load a CPMpy model from an instance file. @@ -300,17 +398,30 @@ def load(self, instance) -> cp.Model: Loading always handles reading internally by calling `read()`. Arguments: - instance: File path to the instance file. + instance (str or os.PathLike): + - File path to the instance file + - OR a string containing the instance content directly Returns: cp.Model: The loaded CPMpy model. 
""" - # Step 1: Reading - use read() to decompress and read raw file contents - content = self.read(instance) - # Step 2: Loading - turn raw contents into CPMpy model + + # If instance is a path to a file -> open file + if isinstance(instance, (str, os.PathLike)) and os.path.exists(instance): + # Reading - use read() to decompress and read raw file contents + content = self.read(instance) + # If instance is a string containing a model -> use it directly + else: + content = instance + + # Loading - turn raw contents into CPMpy model return self.loader(content) + + + + # ---------------------------------------------------------------------------- # # Public interface # # ---------------------------------------------------------------------------- # @@ -348,8 +459,6 @@ def dataset_metadata(cls) -> dict: "url": cls.url, "license": cls.license, "citation": citations, - "domain": cls.domain, - "format": cls.format, } @@ -410,7 +519,7 @@ def _metadata_path(self, instance_path: pathlib.Path) -> pathlib.Path: """ return pathlib.Path(str(instance_path) + self.METADATA_EXTENSION) - def _collect_all_metadata(self, force=False): + def _collect_all_metadata(self, force=False, workers=1): """ Collect and store structured metadata sidecar files for all instances. @@ -426,6 +535,8 @@ def _collect_all_metadata(self, force=False): Arguments: force (bool): If True, re-collect instance metadata even if sidecar files already exist. + workers (int): Number of parallel workers for metadata collection. + Default is 1 (sequential). Use >1 for parallel processing. 
""" files = self._list_instances() @@ -439,72 +550,99 @@ def _collect_all_metadata(self, force=False): if not files_to_process: return - # Use tqdm for progress if available - if tqdm is not None: - file_iter = tqdm(files_to_process, desc="Collecting metadata", unit="file") - else: - file_iter = files_to_process - print(f"Collecting metadata for {len(files_to_process)} instances...") + # Process files sequentially or in parallel + if workers <= 1: + # Sequential processing + if tqdm is not None: + file_iter = tqdm(files_to_process, desc="Collecting metadata", unit="file") + else: + file_iter = files_to_process + print(f"Collecting metadata for {len(files_to_process)} instances...") - for file_path in file_iter: - meta_path = self._metadata_path(file_path) - try: - instance_meta = self.collect_instance_metadata(str(file_path)) - except Exception as e: - instance_meta = {"_metadata_error": str(e)} + for file_path in file_iter: + self._collect_one_metadata(file_path) + else: + # Parallel processing with ProcessPoolExecutor for CPU-bound work + print(f"Collecting metadata for {len(files_to_process)} instances using {workers} workers...") + + # Use ProcessPoolExecutor with fork start method (Linux) to allow bound methods + # On Linux, fork allows sharing the dataset instance, so bound methods work + ctx = multiprocessing.get_context('fork') + with ProcessPoolExecutor(max_workers=workers, mp_context=ctx) as executor: + futures = {executor.submit(self._collect_one_metadata, fp): fp for fp in files_to_process} + + if tqdm is not None: + iterator = tqdm(as_completed(futures), total=len(futures), desc="Collecting metadata", unit="file") + else: + iterator = as_completed(futures) + + for future in iterator: + try: + future.result() + except Exception as e: + fp = futures[future] + print(f"Error collecting metadata for {fp.name}: {e}") + + def _collect_one_metadata(self, file_path): + """Collect metadata for a single instance file.""" + meta_path = 
self._metadata_path(file_path) + try: + instance_meta = self.collect_instance_metadata(str(file_path)) + except Exception as e: + instance_meta = {"_metadata_error": str(e)} + + # Separate portable from format-specific fields + portable = portable_instance_metadata(instance_meta) + format_specific = { + k: v for k, v in instance_meta.items() + if k not in portable and not k.startswith("_") + } - # Separate portable from format-specific fields - portable = portable_instance_metadata(instance_meta) - format_specific = { - k: v for k, v in instance_meta.items() - if k not in portable and not k.startswith("_") - } + # Derive instance name (strip format-specific extensions) + stem = file_path.stem + for ext in (".xml", ".wcnf", ".opb"): + if stem.endswith(ext): + stem = stem[:len(stem) - len(ext)] + break + + # Build structured sidecar + sidecar = { + "dataset": self.dataset_metadata(), + "instance_name": stem, + "source_file": str(file_path.relative_to(self.dataset_dir)), + "category": self.category(), + "instance_metadata": portable, + "format_metadata": format_specific, + } - # Derive instance name (strip format-specific extensions) - stem = file_path.stem - for ext in (".xml", ".wcnf", ".opb"): - if stem.endswith(ext): - stem = stem[:len(stem) - len(ext)] - break - - # Build structured sidecar - sidecar = { - "dataset": self.dataset_metadata(), - "instance_name": stem, - "source_file": str(file_path.relative_to(self.dataset_dir)), - "category": self.category(), - "instance_metadata": portable, - "format_metadata": format_specific, - } + if "_metadata_error" in instance_meta: + sidecar["_metadata_error"] = instance_meta["_metadata_error"] - if "_metadata_error" in instance_meta: - sidecar["_metadata_error"] = instance_meta["_metadata_error"] - - # Preserve previously extracted model features if present. - # Otherwise, compute them from the parsed model when possible. 
- model_features = None - if meta_path.exists(): - try: - with open(meta_path, "r") as f: - existing = json.load(f) - if "model_features" in existing: - model_features = existing["model_features"] - except (json.JSONDecodeError, IOError): - pass - - if model_features is None: - if not callable(self.reader): - raise TypeError( - f"Cannot extract model features for {file_path}: " - "no dataset reader configured. If unexpected, please open an issue on GitHub." - ) - model = self.reader(str(file_path), open=self.open) - model_features = extract_model_features(model) - - sidecar["model_features"] = model_features + # Preserve previously extracted model features if present. + # Otherwise, compute them from the parsed model when possible. + model_features = None + if meta_path.exists(): + try: + with open(meta_path, "r") as f: + existing = json.load(f) + if "model_features" in existing: + model_features = existing["model_features"] + except (json.JSONDecodeError, IOError): + pass + + if model_features is None: + if not callable(self.reader): + raise TypeError( + f"Cannot extract model features for {file_path}: " + "no dataset reader configured. If unexpected, please open an issue on GitHub." 
+ ) + model = self.reader(str(file_path), open=self.open) + model_features = extract_model_features(model) + + sidecar["model_features"] = model_features - with open(meta_path, "w") as f: - json.dump(sidecar, f, indent=2) + with open(meta_path, "w") as f: + json.dump(sidecar, f, indent=2) # ----------------------------- Download methods ----------------------------- # @@ -593,8 +731,6 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, if destination is None: temp_destination.close() - _Dataset._download_sequential(url + target, destination, total_size, desc, chunk_size) - return pathlib.Path(destination) except (HTTPError, URLError) as e: @@ -691,7 +827,7 @@ def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc if tqdm is not None: if total_size > 0: with tqdm(total=total_size, unit='B', unit_scale=True, - unit_divisor=1024, desc=desc, file=sys.stdout, + unit_divisor=1024, desc=f"Downloading {desc}", file=sys.stdout, miniters=1, dynamic_ncols=True, ascii=False) as pbar: with open(filepath, 'wb') as f: while True: @@ -703,7 +839,7 @@ def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc else: # Unknown size with tqdm(unit='B', unit_scale=True, unit_divisor=1024, - desc=desc, file=sys.stdout, miniters=1, + desc=f"Downloading {desc}", file=sys.stdout, miniters=1, dynamic_ncols=True, ascii=False) as pbar: with open(filepath, 'wb') as f: while True: From 9bba9b4962e028d23735c7093ba938e8957d2064 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sat, 28 Feb 2026 13:47:43 +0100 Subject: [PATCH 117/152] Move to datasets --- cpmpy/tools/{dataset => datasets}/__init__.py | 0 cpmpy/tools/{dataset => datasets}/config.py | 0 cpmpy/tools/{dataset => datasets}/jsplib.py | 7 ++++--- cpmpy/tools/{dataset => datasets}/miplib.py | 8 +++++--- cpmpy/tools/{dataset => datasets}/mse.py | 8 +++++--- cpmpy/tools/{dataset => datasets}/nurserostering.py | 7 ++++--- cpmpy/tools/{dataset => datasets}/opb.py | 8 
+++++--- cpmpy/tools/{dataset => datasets}/psplib.py | 7 ++++--- cpmpy/tools/{dataset => datasets}/transforms.py | 0 cpmpy/tools/{dataset => datasets}/xcsp3.py | 7 ++++--- cpmpy/tools/io/nurserostering.py | 2 +- 11 files changed, 32 insertions(+), 22 deletions(-) rename cpmpy/tools/{dataset => datasets}/__init__.py (100%) rename cpmpy/tools/{dataset => datasets}/config.py (100%) rename cpmpy/tools/{dataset => datasets}/jsplib.py (97%) rename cpmpy/tools/{dataset => datasets}/miplib.py (96%) rename cpmpy/tools/{dataset => datasets}/mse.py (96%) rename cpmpy/tools/{dataset => datasets}/nurserostering.py (99%) rename cpmpy/tools/{dataset => datasets}/opb.py (97%) rename cpmpy/tools/{dataset => datasets}/psplib.py (97%) rename cpmpy/tools/{dataset => datasets}/transforms.py (100%) rename cpmpy/tools/{dataset => datasets}/xcsp3.py (97%) diff --git a/cpmpy/tools/dataset/__init__.py b/cpmpy/tools/datasets/__init__.py similarity index 100% rename from cpmpy/tools/dataset/__init__.py rename to cpmpy/tools/datasets/__init__.py diff --git a/cpmpy/tools/dataset/config.py b/cpmpy/tools/datasets/config.py similarity index 100% rename from cpmpy/tools/dataset/config.py rename to cpmpy/tools/datasets/config.py diff --git a/cpmpy/tools/dataset/jsplib.py b/cpmpy/tools/datasets/jsplib.py similarity index 97% rename from cpmpy/tools/dataset/jsplib.py rename to cpmpy/tools/datasets/jsplib.py index d95aeddf1..1300ce2cb 100644 --- a/cpmpy/tools/dataset/jsplib.py +++ b/cpmpy/tools/datasets/jsplib.py @@ -14,7 +14,7 @@ import numpy as np import cpmpy as cp -from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.datasets._base import _Dataset class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible @@ -38,7 +38,7 @@ class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible "E. Taillard. 'Benchmarks for basic scheduling problems', European Journal of Operational Research, Vol. 64, Issue 2, pp. 
278-285, 1993.", ] - def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False): + def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): """ Initialize the JSPLib Dataset. @@ -58,7 +58,8 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, - download=download, extension="" + download=download, extension="", + metadata_workers=metadata_workers ) diff --git a/cpmpy/tools/dataset/miplib.py b/cpmpy/tools/datasets/miplib.py similarity index 96% rename from cpmpy/tools/dataset/miplib.py rename to cpmpy/tools/datasets/miplib.py index 40bc676ab..1d2af9c48 100644 --- a/cpmpy/tools/dataset/miplib.py +++ b/cpmpy/tools/datasets/miplib.py @@ -11,7 +11,7 @@ import pathlib import io -from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.datasets._base import _Dataset class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible @@ -35,7 +35,8 @@ def __init__( root: str = ".", year: int = 2024, track: str = "exact-unweighted", transform=None, target_transform=None, - download: bool = False + download: bool = False, + metadata_workers: int = 1 ): """ Constructor for a dataset object of the MIPLib competition. 
@@ -62,7 +63,8 @@ def __init__( super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, - download=download, extension=".mps.gz" + download=download, extension=".mps.gz", + metadata_workers=metadata_workers ) @staticmethod diff --git a/cpmpy/tools/dataset/mse.py b/cpmpy/tools/datasets/mse.py similarity index 96% rename from cpmpy/tools/dataset/mse.py rename to cpmpy/tools/datasets/mse.py index 04f4f89f1..f1b5eb5e5 100644 --- a/cpmpy/tools/dataset/mse.py +++ b/cpmpy/tools/datasets/mse.py @@ -11,7 +11,7 @@ import pathlib import io -from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.datasets._base import _Dataset class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible @@ -37,7 +37,8 @@ def __init__( root: str = ".", year: int = 2024, track: str = "exact-unweighted", transform=None, target_transform=None, - download: bool = False + download: bool = False, + metadata_workers: int = 1 ): """ Constructor for a dataset object of the MSE competition. @@ -71,7 +72,8 @@ def __init__( super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, - download=download, extension=".wcnf.xz" + download=download, extension=".wcnf.xz", + metadata_workers=metadata_workers ) diff --git a/cpmpy/tools/dataset/nurserostering.py b/cpmpy/tools/datasets/nurserostering.py similarity index 99% rename from cpmpy/tools/dataset/nurserostering.py rename to cpmpy/tools/datasets/nurserostering.py index 015db79b3..6e7b0ab38 100644 --- a/cpmpy/tools/dataset/nurserostering.py +++ b/cpmpy/tools/datasets/nurserostering.py @@ -14,7 +14,7 @@ import io import cpmpy as cp -from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.datasets._base import _Dataset # Optional dependencies try: @@ -48,7 +48,7 @@ class NurseRosteringDataset(_Dataset): # torch.utils.data.Dataset compatible "Rahimian, E., Akartunali, K., and Levine, J. 
A hybrid integer programming and variable neighbourhood search algorithm to solve nurse rostering problems. European Journal of Operational Research, 2017. 258(2): p. 411-423.", ] - def __init__(self, root: str = ".", transform=None, target_transform=None, download:bool=False, sort_key=None): + def __init__(self, root: str = ".", transform=None, target_transform=None, download:bool=False, sort_key=None, metadata_workers: int = 1): """ Initialize the Nurserostering Dataset. @@ -70,7 +70,8 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, - download=download, extension=".txt" + download=download, extension=".txt", + metadata_workers=metadata_workers ) @staticmethod diff --git a/cpmpy/tools/dataset/opb.py b/cpmpy/tools/datasets/opb.py similarity index 97% rename from cpmpy/tools/dataset/opb.py rename to cpmpy/tools/datasets/opb.py index 46ecaa932..66d831b3a 100644 --- a/cpmpy/tools/dataset/opb.py +++ b/cpmpy/tools/datasets/opb.py @@ -11,7 +11,7 @@ import tarfile import io -from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.datasets._base import _Dataset class OPBDataset(_Dataset): @@ -37,7 +37,8 @@ def __init__( year: int = 2024, track: str = "OPT-LIN", competition: bool = True, transform=None, target_transform=None, - download: bool = False + download: bool = False, + metadata_workers: int = 1 ): """ Constructor for a dataset object of the PB competition. 
@@ -73,7 +74,8 @@ def __init__( super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, - download=download, extension=".opb.xz" + download=download, extension=".opb.xz", + metadata_workers=metadata_workers ) diff --git a/cpmpy/tools/dataset/psplib.py b/cpmpy/tools/datasets/psplib.py similarity index 97% rename from cpmpy/tools/dataset/psplib.py rename to cpmpy/tools/datasets/psplib.py index cd9c4e1c1..1940dba2f 100644 --- a/cpmpy/tools/dataset/psplib.py +++ b/cpmpy/tools/datasets/psplib.py @@ -9,7 +9,7 @@ import io import zipfile -from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.datasets._base import _Dataset class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible """ @@ -23,7 +23,7 @@ class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible url = "https://www.om-db.wi.tum.de/psplib/main.html" - def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False): + def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): """ Constructor for a dataset object for PSPlib. 
@@ -60,7 +60,8 @@ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, - download=download, extension=f".{self.family_codes[self.variant]}" + download=download, extension=f".{self.family_codes[self.variant]}", + metadata_workers=metadata_workers ) @staticmethod diff --git a/cpmpy/tools/dataset/transforms.py b/cpmpy/tools/datasets/transforms.py similarity index 100% rename from cpmpy/tools/dataset/transforms.py rename to cpmpy/tools/datasets/transforms.py diff --git a/cpmpy/tools/dataset/xcsp3.py b/cpmpy/tools/datasets/xcsp3.py similarity index 97% rename from cpmpy/tools/dataset/xcsp3.py rename to cpmpy/tools/datasets/xcsp3.py index 1620fb28d..8eb4a0487 100644 --- a/cpmpy/tools/dataset/xcsp3.py +++ b/cpmpy/tools/datasets/xcsp3.py @@ -10,7 +10,7 @@ import pathlib import io -from cpmpy.tools.dataset._base import _Dataset +from cpmpy.tools.datasets._base import _Dataset class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible @@ -32,7 +32,7 @@ class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible url = "https://xcsp.org/instances/" - def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False): + def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): """ Initialize the XCSP3 Dataset. 
""" @@ -51,7 +51,8 @@ def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transf super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, - download=download, extension=".xml.lzma" + download=download, extension=".xml.lzma", + metadata_workers=metadata_workers ) diff --git a/cpmpy/tools/io/nurserostering.py b/cpmpy/tools/io/nurserostering.py index 976a76ad3..111625a41 100644 --- a/cpmpy/tools/io/nurserostering.py +++ b/cpmpy/tools/io/nurserostering.py @@ -25,7 +25,7 @@ import cpmpy as cp from typing import Union -from cpmpy.tools.dataset.nurserostering import ( +from cpmpy.tools.datasets.nurserostering import ( parse_scheduling_period, nurserostering_model ) From e89601e08fe83372f82ff60565c5a552eb4ca8b2 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sat, 28 Feb 2026 13:47:56 +0100 Subject: [PATCH 118/152] xcsp3 io tool --- cpmpy/tools/io/__init__.py | 3 ++- cpmpy/tools/io/xcsp3.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 cpmpy/tools/io/xcsp3.py diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index 5e396ac00..6c7af0acb 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -20,4 +20,5 @@ # Model datasets from .opb import load_opb, read_opb, write_opb # read_opb is alias from .scip import load_scip, read_scip, write_scip # read_scip is alias -from .wcnf import load_wcnf, read_wcnf # read_wcnf is alias \ No newline at end of file +from .wcnf import load_wcnf, read_wcnf # read_wcnf is alias +from .xcsp3 import load_xcsp3 \ No newline at end of file diff --git a/cpmpy/tools/io/xcsp3.py b/cpmpy/tools/io/xcsp3.py new file mode 100644 index 000000000..7912bbfb1 --- /dev/null +++ b/cpmpy/tools/io/xcsp3.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- +## +## xcsp3.py +## +""" +XCSP3 parser. + +================= +List of functions +================= + +.. 
autosummary:: + :nosignatures: + + load_xcsp3 +""" + +import os +from typing import Union + + +import cpmpy as cp + +from cpmpy.tools.xcsp3.parser import load_xcsp3 as load_xcsp3_parser +_std_open = open +def load_xcsp3(xcsp3: Union[str, os.PathLike], open=open) -> cp.Model: + return load_xcsp3_parser(xcsp3, open=open) + From 59623afa68a165cb756cd743588af4524a8874e1 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sat, 28 Feb 2026 13:48:16 +0100 Subject: [PATCH 119/152] io tool all pip install --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index fabbb7c60..9959c5cde 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ def get_version(rel_path): "io.wcnf": [], # No external dependencies "io.xcsp3": ["pycsp3"], } +format_dependencies["io.all"] = list({pkg for group in format_dependencies.values() for pkg in group}) setup( name='cpmpy', From 6e803ce724df9d324648cbad214d7d22ca3c6d30 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:10:57 +0100 Subject: [PATCH 120/152] Dataset class hierarchy --- cpmpy/tools/datasets/__init__.py | 4 +- cpmpy/tools/datasets/_base.py | 149 +++++++++++++++++++++---- cpmpy/tools/datasets/jsplib.py | 4 +- cpmpy/tools/datasets/miplib.py | 4 +- cpmpy/tools/datasets/mse.py | 11 +- cpmpy/tools/datasets/nurserostering.py | 4 +- cpmpy/tools/datasets/opb.py | 4 +- cpmpy/tools/datasets/psplib.py | 4 +- cpmpy/tools/datasets/xcsp3.py | 5 +- 9 files changed, 149 insertions(+), 40 deletions(-) diff --git a/cpmpy/tools/datasets/__init__.py b/cpmpy/tools/datasets/__init__.py index cc4ca2857..01c75630f 100644 --- a/cpmpy/tools/datasets/__init__.py +++ b/cpmpy/tools/datasets/__init__.py @@ -1,11 +1,11 @@ -from ._base import extract_model_features, portable_instance_metadata +from ._base import extract_model_features, portable_instance_metadata, FileDataset from .miplib import MIPLibDataset from .jsplib import JSPLibDataset from .psplib import PSPLibDataset from .nurserostering import NurseRosteringDataset 
from .xcsp3 import XCSP3Dataset from .opb import OPBDataset -from .mse import MSEDataset +from .mse import MaxSATEvalDataset from .transforms import Compose, Open, Load, Serialize, Translate, SaveToFile, Lambda, extract_format_metadata # Backward compatibility alias Parse = Load diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py index fa2e8a298..fac7d6216 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -3,7 +3,13 @@ This module defines the abstract `_Dataset` class, which serves as the foundation for loading and managing benchmark instance collections in CPMpy-based experiments. -It standardizes how datasets are stored, accessed, and optionally transformed. +It standardizes how datasets are downloaded, stored, accessed, and optionally transformed. + +It provides a Pytorch compatible interface (constructor arguments like "transform" and the +methods __len__ and __getitem__ for iterating over the dataset). + +Additionaly, it provides a collection of methods and helper functions to adapt the dataset +to the specific usecase requirements of constraint optimisation benchmarks. """ from abc import ABC, abstractmethod @@ -13,12 +19,14 @@ import io import tempfile import warnings -from typing import Any, Optional, Tuple, List, Union +from typing import Any, Iterator, Optional, Tuple, List, Union, Callable from urllib.error import URLError from urllib.request import HTTPError, Request, urlopen from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed import multiprocessing +from altair.utils.schemapi import _passthrough + # tqdm as an optional dependency, provides prettier progress bars try: from tqdm import tqdm @@ -27,13 +35,16 @@ import cpmpy as cp -# Fields produced by extract_model_features() — not portable across format translations +# TODO: move elsewhere? 
+# Fields produced by extract_model_features() (after loading into a CPMpy model) +# - not portable across format translations _MODEL_FEATURE_FIELDS = frozenset({ "num_variables", "num_bool_variables", "num_int_variables", "num_constraints", "constraint_types", "has_objective", "objective_type", "domain_size_min", "domain_size_max", "domain_size_mean", }) +# TODO: move elsewhere? # Prefixes for format-specific metadata fields (not portable across translations) _FORMAT_SPECIFIC_PREFIXES = ("opb_", "wcnf_", "mps_", "xcsp_", "dimacs_") @@ -41,6 +52,8 @@ def _format_bytes(bytes_num): """ Format bytes into human-readable string (e.g., KB, MB, GB). + + Used to display download progress. """ for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']: if bytes_num < 1024.0: @@ -49,7 +62,8 @@ def _format_bytes(bytes_num): def portable_instance_metadata(metadata: dict) -> dict: - """Filter sidecar metadata to only portable, domain-specific fields. + """ + Filter sidecar metadata to only portable, domain-specific fields. Strips model features (num_variables, constraint_types, ...), format-specific fields (opb_*, wcnf_*, mps_*, ...), and internal @@ -222,12 +236,81 @@ def _collect_one_metadata_worker(file_path_str): return str(file_path) -class _Dataset(ABC): +""" +dataset.map(transform) +dataset.filter(predicate) +dataset.shuffle(seed) +dataset.split(ratio) +""" + +class Dataset(ABC): + """ + Abstract base class for datasets. + + Each instance in a dataset is characterised by a (x, y) pair of: + x: instance reference (e.g., file path, database key, generated seed, ...) + y: metadata (solution, features, origin, etc.) + """ + + def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None): + """ + Arguments: + transform (callable, optional): Optional transform applied to the instance reference. + target_transform (callable, optional): Optional transform applied to the metadata. 
+ """ + self.transform = transform + self.target_transform = target_transform + +class IndexedDataset(Dataset): + """ + Abstract base class for indexed datasets. + """ + + @abstractmethod + def __len__(self) -> int: + """ + Return the total number of instances. + """ + pass + + @abstractmethod + def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Return the instance and metadata at the given index. + + Returns: + x: instance reference (e.g., file path, database key, generated seed, ...) + y: metadata (solution, features, origin, etc.) + """ + pass + + def __iter__(self): + for i in range(len(self)): + yield self[i] + +class IterableDataset(Dataset): + """ + Abstract base class for iterable datasets. + """ + + @abstractmethod + def __iter__(self) -> Iterator[Tuple[Any, Any]]: + """ + Return an iterator over the dataset. + + Returns: + Iterator[Tuple[Any, Any]]: Iterator over the dataset, yielding (x, y) pairs of: + x: instance reference (e.g., file path, database key, generated seed, ...) + y: metadata (solution, features, origin, etc.) + """ + pass + +class FileDataset(IndexedDataset): """ - Abstract base class for PyTorch-style datasets of benchmarking instances. + Abstract base class for PyTorch-style datasets of CO benchmarking instances. - The `_Dataset` class provides a standardized interface for downloading and - accessing benchmark instances. This class should not be used on its own. + The `FileDataset` class provides a standardized interface for downloading and + accessing file-backed benchmark instances. This class should not be used on its own. Instead have a look at one of the concrete subclasses, providing access to well-known datasets from the community. """ @@ -241,16 +324,17 @@ class _Dataset(ABC): license = "" citation: List[str] = [] + # TODO: remove for now? 
# Multiple download origins (override in subclasses or via config) # Origins are tried in order, falling back to original url if all fail origins: List[str] = [] # List of URL bases to try before falling back to original url def __init__( self, - dataset_dir: str = ".", - transform=None, target_transform=None, + root: str = ".", + transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, - extension:str=".txt", + extension: str = ".txt", metadata_workers: int = 1, **kwargs ): @@ -258,11 +342,11 @@ def __init__( Constructor for the _Dataset base class. Arguments: - dataset_dir (str): Path to the dataset directory. + root (str): Path to the dataset directory. transform (callable, optional): Optional transform applied to the instance file path. target_transform (callable, optional): Optional transform applied to the metadata dictionary. download (bool): If True, downloads the dataset if it does not exist locally (default=False). - extension (str): Extension of the instance files. + extension (str): Extension of the instance files. Used to filter instance files from the dataset directory. metadata_workers (int): Number of parallel workers for metadata collection during download (default: 1). Raises: @@ -270,10 +354,10 @@ def __init__( or if the requested year/track combination is not available. ValueError: If the dataset directory does not contain any instance files. """ - self.dataset_dir = pathlib.Path(dataset_dir) - self.transform = transform - self.target_transform = target_transform + + self.dataset_dir = pathlib.Path(root) self.extension = extension + if not self.origins: from cpmpy.tools.datasets.config import get_origins self.origins = get_origins(self.name) @@ -291,6 +375,7 @@ def __init__( if len(files) == 0: raise ValueError(f"Cannot find any instances inside dataset {self.dataset_dir}. Is it a valid dataset? 
If so, please report on GitHub.") + super().__init__(transform=transform, target_transform=target_transform) # ---------------------------------------------------------------------------- # # Methods to implement in subclasses: # @@ -668,7 +753,7 @@ def _try_origin(base_url: str, target: str, destination: str, desc: str, chunk_s with urlopen(req) as response: total_size = int(response.headers.get('Content-Length', 0)) - _Dataset._download_sequential(full_url, destination, total_size, desc, chunk_size) + FileDataset._download_sequential(full_url, destination, total_size, desc, chunk_size) return pathlib.Path(destination) except (HTTPError, URLError): return None @@ -714,7 +799,7 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, # Try custom origins first if provided if origins: for origin_url in origins: - result = _Dataset._try_origin(origin_url, target, destination, desc, chunk_size) + result = FileDataset._try_origin(origin_url, target, destination, desc, chunk_size) if result is not None: return result @@ -726,7 +811,7 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, # Convert destination to Path for _download_sequential download_path = pathlib.Path(destination) if destination is not None else pathlib.Path(temp_destination.name) - _Dataset._download_sequential(url + target, download_path, total_size, desc, chunk_size) + FileDataset._download_sequential(url + target, download_path, total_size, desc, chunk_size) if destination is None: temp_destination.close() @@ -772,7 +857,7 @@ def download_one(url_suffix: str, target: str) -> Tuple[Optional[pathlib.Path], # Try custom origins first if origins: for origin_url in origins: - result = _Dataset._try_origin(origin_url, url_suffix + target, dest_path, desc, chunk_size) + result = FileDataset._try_origin(origin_url, url_suffix + target, dest_path, desc, chunk_size) if result is not None: return result, None @@ -783,7 +868,7 @@ def download_one(url_suffix: 
str, target: str) -> Tuple[Optional[pathlib.Path], with urlopen(req) as response: total_size = int(response.headers.get('Content-Length', 0)) - _Dataset._download_sequential(full_url, dest_path, total_size, desc, chunk_size) + FileDataset._download_sequential(full_url, dest_path, total_size, desc, chunk_size) return pathlib.Path(dest_path), None except Exception as e: return None, str(e) @@ -866,3 +951,25 @@ def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc sys.stdout.flush() sys.stdout.write("\n") sys.stdout.flush() + + + +class URLDataset(IndexedDataset): + """ + Abstract base class for URL-backed datasets. + + Each instance reference is a URL. + """ + pass + +class StreamingDataset(IterableDataset): + """ + Abstract base class for streaming datasets. + """ + pass + +class GeneratedDataset(IterableDataset): + """ + Abstract base class for generated datasets. + """ + pass \ No newline at end of file diff --git a/cpmpy/tools/datasets/jsplib.py b/cpmpy/tools/datasets/jsplib.py index 1300ce2cb..365151876 100644 --- a/cpmpy/tools/datasets/jsplib.py +++ b/cpmpy/tools/datasets/jsplib.py @@ -14,10 +14,10 @@ import numpy as np import cpmpy as cp -from cpmpy.tools.datasets._base import _Dataset +from cpmpy.tools.datasets._base import FileDataset -class JSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible +class JSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible """ JSP Dataset in a PyTorch compatible format. 
diff --git a/cpmpy/tools/datasets/miplib.py b/cpmpy/tools/datasets/miplib.py index 1d2af9c48..8848522fa 100644 --- a/cpmpy/tools/datasets/miplib.py +++ b/cpmpy/tools/datasets/miplib.py @@ -11,10 +11,10 @@ import pathlib import io -from cpmpy.tools.datasets._base import _Dataset +from cpmpy.tools.datasets._base import FileDataset -class MIPLibDataset(_Dataset): # torch.utils.data.Dataset compatible +class MIPLibDataset(FileDataset): # torch.utils.data.Dataset compatible """ MIPLib Dataset in a PyTorch compatible format. diff --git a/cpmpy/tools/datasets/mse.py b/cpmpy/tools/datasets/mse.py index f1b5eb5e5..0f5e2aa7f 100644 --- a/cpmpy/tools/datasets/mse.py +++ b/cpmpy/tools/datasets/mse.py @@ -11,13 +11,14 @@ import pathlib import io -from cpmpy.tools.datasets._base import _Dataset +import cpmpy as cp +from cpmpy.tools.datasets._base import FileDataset -class MSEDataset(_Dataset): # torch.utils.data.Dataset compatible +class MaxSATEvalDataset(FileDataset): # torch.utils.data.Dataset compatible """ - MaxSAT Evaluation (MSE) benchmark dataset. + MaxSAT Evaluation benchmark dataset. Provides access to benchmark instances from the MaxSAT Evaluation competitions. Instances are grouped by `year` and `track` (e.g., @@ -41,7 +42,7 @@ def __init__( metadata_workers: int = 1 ): """ - Constructor for a dataset object of the MSE competition. + Constructor for a dataset object of the MaxSAT Evaluation competition. Arguments: root (str): Root directory where datasets are stored or will be downloaded to (default="."). 
@@ -167,6 +168,6 @@ def open(self, instance: os.PathLike) -> io.TextIOBase: if __name__ == "__main__": - dataset = MSEDataset(year=2024, track="exact-weighted", download=True) + dataset = MaxSATEvalDataset(year=2024, track="exact-weighted", download=True) print("Dataset size:", len(dataset)) print("Instance 0:", dataset[0]) diff --git a/cpmpy/tools/datasets/nurserostering.py b/cpmpy/tools/datasets/nurserostering.py index 6e7b0ab38..9f900ed12 100644 --- a/cpmpy/tools/datasets/nurserostering.py +++ b/cpmpy/tools/datasets/nurserostering.py @@ -14,7 +14,7 @@ import io import cpmpy as cp -from cpmpy.tools.datasets._base import _Dataset +from cpmpy.tools.datasets._base import FileDataset # Optional dependencies try: @@ -30,7 +30,7 @@ _HAS_FAKER = False -class NurseRosteringDataset(_Dataset): # torch.utils.data.Dataset compatible +class NurseRosteringDataset(FileDataset): # torch.utils.data.Dataset compatible """ Nurserostering Dataset in a PyTorch compatible format. diff --git a/cpmpy/tools/datasets/opb.py b/cpmpy/tools/datasets/opb.py index 66d831b3a..4df23d6cc 100644 --- a/cpmpy/tools/datasets/opb.py +++ b/cpmpy/tools/datasets/opb.py @@ -11,10 +11,10 @@ import tarfile import io -from cpmpy.tools.datasets._base import _Dataset +from cpmpy.tools.datasets._base import FileDataset -class OPBDataset(_Dataset): +class OPBDataset(FileDataset): """ Pseudo Boolean Competition (PB) benchmark dataset. diff --git a/cpmpy/tools/datasets/psplib.py b/cpmpy/tools/datasets/psplib.py index 1940dba2f..e2fec4496 100644 --- a/cpmpy/tools/datasets/psplib.py +++ b/cpmpy/tools/datasets/psplib.py @@ -9,9 +9,9 @@ import io import zipfile -from cpmpy.tools.datasets._base import _Dataset +from cpmpy.tools.datasets._base import FileDataset -class PSPLibDataset(_Dataset): # torch.utils.data.Dataset compatible +class PSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible """ PSPlib Dataset in a PyTorch compatible format. 
diff --git a/cpmpy/tools/datasets/xcsp3.py b/cpmpy/tools/datasets/xcsp3.py index 8eb4a0487..461bb83b2 100644 --- a/cpmpy/tools/datasets/xcsp3.py +++ b/cpmpy/tools/datasets/xcsp3.py @@ -10,10 +10,11 @@ import pathlib import io -from cpmpy.tools.datasets._base import _Dataset +import cpmpy as cp +from cpmpy.tools.datasets._base import FileDataset -class XCSP3Dataset(_Dataset): # torch.utils.data.Dataset compatible +class XCSP3Dataset(FileDataset): # torch.utils.data.Dataset compatible """ XCSP3 Dataset in a PyTorch compatible format. From 9e0bbdc1e9cee86ef5d8aaa7e02a50832adc4f14 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:13:55 +0100 Subject: [PATCH 121/152] Dataset metadata properties --- cpmpy/tools/datasets/_base.py | 46 ++++++++++++++++++++++++++--------- cpmpy/tools/datasets/mse.py | 34 ++++++++++++++++++++------ 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py index fac7d6216..d9a29a198 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -318,20 +318,35 @@ class FileDataset(IndexedDataset): # Extension for metadata sidecar files METADATA_EXTENSION = ".meta.json" - # Dataset-level metadata (override in subclasses) - description = "" - url = "" - license = "" - citation: List[str] = [] + # -------------- Dataset-level metadata (override in subclasses) ------------- # + @property + @abstractmethod + def name(self) -> str: pass + + @property + @abstractmethod + def description(self) -> str: pass + + @property + @abstractmethod + def url(self) -> str: pass + + @property + def citation(self) -> List[str]: + return [] + # TODO: remove for now? 
# Multiple download origins (override in subclasses or via config) # Origins are tried in order, falling back to original url if all fail origins: List[str] = [] # List of URL bases to try before falling back to original url + # ---------------------------------------------------------------------------- # + + def __init__( self, - root: str = ".", + dataset_dir: str = ".", transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, extension: str = ".txt", @@ -342,7 +357,7 @@ def __init__( Constructor for the _Dataset base class. Arguments: - root (str): Path to the dataset directory. + dataset_dir (str): Path to the dataset directory. transform (callable, optional): Optional transform applied to the instance file path. target_transform (callable, optional): Optional transform applied to the metadata dictionary. download (bool): If True, downloads the dataset if it does not exist locally (default=False). @@ -355,14 +370,15 @@ def __init__( ValueError: If the dataset directory does not contain any instance files. """ - self.dataset_dir = pathlib.Path(root) + self.dataset_dir = pathlib.Path(dataset_dir) self.extension = extension - if not self.origins: - from cpmpy.tools.datasets.config import get_origins - self.origins = get_origins(self.name) + # TODO: remove for later? + # if not self.origins: + # from cpmpy.tools.datasets.config import get_origins + # self.origins = get_origins(self.name) - if not self.dataset_dir.exists(): + if not self._check_exists(): if not download: raise ValueError("Dataset not found. Please set download=True to download the dataset.") else: @@ -377,6 +393,12 @@ def __init__( super().__init__(transform=transform, target_transform=target_transform) + def _check_exists(self) -> bool: + """ + Check if the dataset exists (has been downloaded). 
+ """ + return self.dataset_dir.exists() + # ---------------------------------------------------------------------------- # # Methods to implement in subclasses: # # ---------------------------------------------------------------------------- # diff --git a/cpmpy/tools/datasets/mse.py b/cpmpy/tools/datasets/mse.py index 0f5e2aa7f..ebbc35b6f 100644 --- a/cpmpy/tools/datasets/mse.py +++ b/cpmpy/tools/datasets/mse.py @@ -29,9 +29,25 @@ class MaxSATEvalDataset(FileDataset): # torch.utils.data.Dataset compatible More information on the competition can be found here: https://maxsat-evaluations.github.io/ """ - name = "mse" - description = "MaxSAT Evaluation competition benchmark instances." - url = "https://maxsat-evaluations.github.io/" + # -------------------------- Dataset-level metadata -------------------------- # + + @property + def name(self) -> str: + return "maxsateval" + + @property + def description(self) -> str: + return "MaxSAT Evaluation competition benchmark instances." + + @property + def url(self) -> str: + return "https://maxsat-evaluations.github.io/" + + @property + def citation(self) -> List[str]: + return [] + + # ---------------------------------------------------------------------------- # def __init__( self, @@ -39,37 +55,39 @@ def __init__( year: int = 2024, track: str = "exact-unweighted", transform=None, target_transform=None, download: bool = False, + dataset_dir: Optional[os.PathLike] = None, metadata_workers: int = 1 ): """ Constructor for a dataset object of the MaxSAT Evaluation competition. Arguments: - root (str): Root directory where datasets are stored or will be downloaded to (default="."). + root (str): Root directory where datasets are stored or will be downloaded to (default="."). If `dataset_dir` is provided, this argument is ignored. year (int): Competition year of the dataset to use (default=2024). track (str): Track name specifying which subset of the competition instances to load (default="exact-unweighted"). 
transform (callable, optional): Optional transform applied to the instance file path. target_transform (callable, optional): Optional transform applied to the metadata dictionary. download (bool): If True, downloads the dataset if it does not exist locally (default=False). - + dataset_dir (Optional[os.PathLike]): Path to the dataset directory. If not provided, it will be inferred from the root and year/track. Raises: ValueError: If the dataset directory does not exist and `download=False`, or if the requested year/track combination is not available. """ + # Dataset-specific attributes self.root = pathlib.Path(root) self.year = year self.track = track - # Check requested dataset + # Check requested dataset is valid if not str(year).startswith('20'): raise ValueError("Year must start with '20'") if not track: raise ValueError("Track must be specified, e.g. OPT-LIN, DEC-LIN, ...") - dataset_dir = self.root / self.name / str(year) / track - + dataset_dir = pathlib.Path(dataset_dir) / str(year) / track if dataset_dir else self.root / self.name / str(year) / track + super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, From 0d88403cf5401ee3202f448519a8e7d932134033 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:15:01 +0100 Subject: [PATCH 122/152] _loader --- cpmpy/tools/datasets/_base.py | 30 +++++++----------------------- cpmpy/tools/datasets/mse.py | 19 ++++--------------- cpmpy/tools/datasets/xcsp3.py | 14 ++------------ 3 files changed, 13 insertions(+), 50 deletions(-) diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py index d9a29a198..c3c681382 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -405,23 +405,10 @@ def _check_exists(self) -> bool: @staticmethod @abstractmethod - def reader(file_path, open=open) -> cp.Model: + def _loader(content: str) -> cp.Model: """ - Reader for the dataset. - Parses a file path directly into a CPMpy model. 
- For backward compatibility. Consider using read() + load() instead. - """ - pass - - @staticmethod - @abstractmethod - def loader(content: str) -> cp.Model: - """ - Loader for the dataset. - Loads a CPMpy model from raw file content string. - - This is the "loading" step: turning raw contents into a CPMpy model. - The content should be the raw text content of the file (already decompressed). + Loader for the dataset. Loads a CPMpy model from raw file content string. + The content will be the raw text content of the file (already decompressed). Arguments: content (str): Raw file content string to load into a model. @@ -477,17 +464,15 @@ def open(cls, instance: os.PathLike) -> io.TextIOBase: instance (os.PathLike): File path to the instance file. Returns: - io.TextIOBase: The opened file. + io.TextIOBase: The opened file handle. """ return open(instance, "r") def read(self, instance: os.PathLike) -> str: """ Read raw file contents from an instance file. - Handles decompression automatically via dataset.open(). + Handles optional decompression automatically via dataset.open(). - This is the "reading" step: decompressing + reading raw file contents. - Arguments: instance (os.PathLike): File path to the instance file. Returns: @@ -500,9 +485,8 @@ def load(self, instance: Union[str, os.PathLike]) -> cp.Model: """ Load a CPMpy model from an instance file. - This is the "loading" step: uses `read()` to handle reading (decompressing + - reading raw contents) and then turns raw contents into a CPMpy model via `loader()`. - Loading always handles reading internally by calling `read()`. + Uses `.read()` to handle reading (decompressing + reading raw contents) and then turns + raw contents into a CPMpy model via `.loader()`. 
Arguments: instance (str or os.PathLike): diff --git a/cpmpy/tools/datasets/mse.py b/cpmpy/tools/datasets/mse.py index ebbc35b6f..bfc948fdc 100644 --- a/cpmpy/tools/datasets/mse.py +++ b/cpmpy/tools/datasets/mse.py @@ -7,11 +7,13 @@ import os import lzma +from typing import List, Optional import zipfile import pathlib import io import cpmpy as cp +from cpmpy.tools.io.wcnf import load_wcnf from cpmpy.tools.datasets._base import FileDataset @@ -97,23 +99,11 @@ def __init__( @staticmethod - def reader(file_path, open=open): + def _loader(content: str) -> cp.Model: """ - Reader for MSE dataset. - Parses a file path directly into a CPMpy model. - For backward compatibility. Consider using read() + load() instead. - """ - from cpmpy.tools.io.wcnf import load_wcnf - return load_wcnf(file_path, open=open) - - @staticmethod - def loader(content: str): - """ - Loader for MSE dataset. + Loader for MaxSAT Evaluation dataset. Loads a CPMpy model from raw WCNF content string. """ - from cpmpy.tools.io.wcnf import load_wcnf - # load_wcnf already supports raw strings return load_wcnf(content) def category(self) -> dict: @@ -154,7 +144,6 @@ def collect_instance_metadata(self, file) -> dict: return result def download(self): - url = f"https://www.cs.helsinki.fi/group/coreo/MSE{self.year}-instances/" target = f"mse{str(self.year)[2:]}-{self.track}.zip" target_download_path = self.root / target diff --git a/cpmpy/tools/datasets/xcsp3.py b/cpmpy/tools/datasets/xcsp3.py index 461bb83b2..a947c9b3a 100644 --- a/cpmpy/tools/datasets/xcsp3.py +++ b/cpmpy/tools/datasets/xcsp3.py @@ -57,18 +57,8 @@ def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transf ) - @classmethod - def reader(cls, file_path, open=open): - """ - Reader for XCSP3 dataset. - Parses a file path directly into a CPMpy model. - For backward compatibility. Consider using read() + load() instead. 
- """ - from cpmpy.tools.xcsp3.parser import load_xcsp3 - return load_xcsp3(file_path, open=open) - - @classmethod - def loader(cls, content: str): + @staticmethod + def _loader(content: str) -> cp.Model: """ Loader for XCSP3 dataset. Loads a CPMpy model from raw XCSP3 content string. From 783504bf00649d10531ca9ed47dc58ecf449c1b4 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:15:12 +0100 Subject: [PATCH 123/152] utils --- cpmpy/tools/datasets/utils.py | 148 ++++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 cpmpy/tools/datasets/utils.py diff --git a/cpmpy/tools/datasets/utils.py b/cpmpy/tools/datasets/utils.py new file mode 100644 index 000000000..ae48e8701 --- /dev/null +++ b/cpmpy/tools/datasets/utils.py @@ -0,0 +1,148 @@ +""" +Dataset utilities: generic download manager. + +Downloads one or multiple files from URLs. Supports optional parallel downloads +via a configurable worker count. How files are fetched (HTTP, progress bars, +chunking) is encapsulated here; datasets only pass (url, destination) and options. +""" + +import pathlib +import warnings +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import List, Tuple, Union +from urllib.request import Request, urlopen + + +def _get_content_length(url: str) -> int: + """Return Content-Length for url, or 0 if unknown.""" + try: + req = Request(url) + req.get_method = lambda: "HEAD" + with urlopen(req) as resp: + return int(resp.headers.get("Content-Length", 0)) + except Exception: + return 0 + + +def _download_url( + url: str, + destination: Union[str, pathlib.Path], + desc: str = None, + chunk_size: int = 1024 * 1024, + _sequential_impl=None, +) -> pathlib.Path: + """ + Download a single file from url to destination. + Uses _sequential_impl(url, path, total_size, desc, chunk_size) if provided, + otherwise delegates to the dataset base implementation. 
+ """ + destination = pathlib.Path(destination) + destination.parent.mkdir(parents=True, exist_ok=True) + if desc is None: + desc = destination.name + total_size = _get_content_length(url) + if _sequential_impl is None: + from cpmpy.tools.dataset._base import _Dataset + _sequential_impl = _Dataset._download_sequential + _sequential_impl(url, destination, total_size, desc, chunk_size) + return destination + + +def download_manager( + url: Union[str, List[str]], + destination: Union[str, pathlib.Path, List[Union[str, pathlib.Path]]] = None, + *, + workers: int = 1, + desc_prefix: str = "Downloading", + chunk_size: int = 1024 * 1024, + skip_existing: bool = True, + **kwargs, +) -> Union[pathlib.Path, List[pathlib.Path]]: + """ + Generic download manager: one URL or many, sequential or parallel. + + Single file: + path = download("https://example.com/file.zip", "/tmp/file.zip") + path = download("https://example.com/file.zip", destination="/tmp/out.zip", workers=1) + + Multiple files (list of (url, destination)): + paths = download([("https://a.com/1.cnf", "/data/1.cnf"), ...], workers=4) + + Arguments: + url: Either a single URL string, or a list of URL strings. + destination: For single-URL mode, path to save the file. For multiple-URL mode, list of matching destination paths. + workers: Number of parallel download workers. 1 = sequential. >1 = parallel (only for multiple files). + desc_prefix: Prefix for progress description (e.g. "Instance 1/100"). + chunk_size: Chunk size in bytes for streaming. + skip_existing: If True, skip pairs where destination already exists (multi-file only). + **kwargs: Ignored; allows callers to pass through options (e.g. from dataset download(**kwargs)). + + Returns: + For single URL: path to the downloaded file. + For multiple: list of paths that were downloaded (skipped files are not in the list). 
+ """ + if isinstance(url, str): + if destination is None: + raise ValueError("destination is required when passing a single URL") + return _download_url(url, destination, desc=desc_prefix or url, chunk_size=chunk_size) + + items: List[Tuple[str, pathlib.Path]] = [ + (url, pathlib.Path(dest)) for url, dest in zip(url, destination) + ] + + if not items: + return [] + + if skip_existing: + items = [(u, d) for u, d in items if not d.exists()] + + if not items: + return [] + + if workers is None or workers <= 1: + # Sequential + results = [] + for i, (url, dest) in enumerate(items): + desc = f"{desc_prefix} {i + 1}/{len(items)} {dest.name}" + try: + results.append(_download_url(url, dest, desc=desc, chunk_size=chunk_size)) + except Exception as e: + warnings.warn(f"Failed to download {url}: {e}") + return results + + # Parallel + max_workers = min(workers, len(items)) + results = [] + errors = [] + + def do_one(url: str, dest: pathlib.Path, idx: int) -> Tuple: + desc = f"{desc_prefix} {idx + 1}/{len(items)} {dest.name}" + try: + return _download_url(url, dest, desc=desc, chunk_size=chunk_size), None + except Exception as e: + return None, str(e) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(do_one, url, dest, i): (url, dest) + for i, (url, dest) in enumerate(items) + } + for future in as_completed(futures): + result, err = future.result() + if result is not None: + results.append(result) + else: + url, dest = futures[future] + errors.append((dest.name, err)) + + if errors: + warnings.warn( + f"Failed to download {len(errors)}/{len(items)} files. 
" + f"First error: {errors[0][0]} - {errors[0][1]}" + ) + + return results + + +# Convenience alias for multi-file callers +download_many = download_manager From a76560fa17539a9e9689429059d3b7f51dee0271 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:15:47 +0100 Subject: [PATCH 124/152] Fix paths --- cpmpy/tools/datasets/_base.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py index c3c681382..63553aaa2 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -509,18 +509,23 @@ def load(self, instance: Union[str, os.PathLike]) -> cp.Model: return self.loader(content) - - - - # ---------------------------------------------------------------------------- # # Public interface # # ---------------------------------------------------------------------------- # - def instance_metadata(self, file: pathlib.Path) -> dict: + def instance_metadata(self, file: os.PathLike) -> dict: + """ + Return the metadata for a given instance file. + + Arguments: + file (os.PathLike): Path to the instance file. + + Returns: + dict: The metadata for the instance. + """ metadata = self.category() | { 'dataset': self.name, - 'name': pathlib.Path(file).stem.replace(self.extension, ''), + 'name': pathlib.Path(file).name.replace(self.extension, ''), 'path': file, } # Load sidecar metadata if it exists @@ -598,19 +603,19 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: # ---------------------------- Metadata collection --------------------------- # - def _metadata_path(self, instance_path: pathlib.Path) -> pathlib.Path: + def _metadata_path(self, instance_path: os.PathLike) -> pathlib.Path: """ Return the path to the `.meta.json` sidecar file for a given instance. Arguments: - instance_path: path to the instance file + instance_path (os.PathLike): Path to the instance file. 
Returns: - path to the `.meta.json` sidecar file + pathlib.Path: Path to the `.meta.json` sidecar file. """ return pathlib.Path(str(instance_path) + self.METADATA_EXTENSION) - def _collect_all_metadata(self, force=False, workers=1): + def _collect_all_metadata(self, force: bool = False, workers: int = 1): """ Collect and store structured metadata sidecar files for all instances. @@ -901,7 +906,7 @@ def download_one(url_suffix: str, target: str) -> Tuple[Optional[pathlib.Path], return downloaded_files @staticmethod - def _download_sequential(url: str, filepath: pathlib.Path, total_size: int, desc: str, + def _download_sequential(url: str, filepath: os.PathLike, total_size: int, desc: str, chunk_size: int = 1024 * 1024): """Download file sequentially with progress bar.""" import sys From 86eaf16996f534818b0905d12dc0f6eac6c281f1 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:16:22 +0100 Subject: [PATCH 125/152] Files and generator datasets --- cpmpy/tools/datasets/_base.py | 59 ++++++++++++++++++++++++++++++ cpmpy/tools/datasets/transforms.py | 20 +++++----- 2 files changed, 69 insertions(+), 10 deletions(-) diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py index 63553aaa2..9a3f32f67 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -11,6 +11,7 @@ Additionaly, it provides a collection of methods and helper functions to adapt the dataset to the specific usecase requirements of constraint optimisation benchmarks. """ +from __future__ import annotations from abc import ABC, abstractmethod import json @@ -305,6 +306,20 @@ def __iter__(self) -> Iterator[Tuple[Any, Any]]: """ pass + @staticmethod + def from_generator(generator: callable) -> IterableDataset: + """ + Create an IterableDataset from a generator. 
+ """ + class FromGeneratorDataset(IterableDataset): + def __init__(self, generator: callable): + self.generator = generator + + def __iter__(self): + return self.generator() + + return FromGeneratorDataset(generator) + class FileDataset(IndexedDataset): """ Abstract base class for PyTorch-style datasets of CO benchmarking instances. @@ -543,6 +558,9 @@ def instance_metadata(self, file: os.PathLike) -> dict: def dataset_metadata(cls) -> dict: """ Return dataset-level metadata as a dictionary. + + Returns: + dict: The dataset-level metadata. """ if isinstance(cls.citation, str): citations = [cls.citation] if cls.citation else [] @@ -963,7 +981,48 @@ def _download_sequential(url: str, filepath: os.PathLike, total_size: int, desc: sys.stdout.write("\n") sys.stdout.flush() +def from_files(dataset_dir: os.PathLike, extension: str = ".txt") -> FileDataset: + """ + Create a FileDataset from a list of files. + """ + class FromFilesDataset(FileDataset): + def __init__(self, dataset_dir: os.PathLike, extension: str = ".txt"): + super().__init__(dataset_dir=dataset_dir, extension=extension) + + @property + def name(self) -> str: + raise NotImplementedError("Arbitrary file dataset does not support a name. Please implement this method in a subclass, or use a more specific dataset class.") + + @property + def description(self) -> str: + raise NotImplementedError("Arbitrary file dataset does not support a description. Please implement this method in a subclass, or use a more specific dataset class.") + + @property + def url(self) -> str: + raise NotImplementedError("Arbitrary file dataset does not support a URL. Please implement this method in a subclass, or use a more specific dataset class.") + + @property + def citation(self) -> List[str]: + raise NotImplementedError("Arbitrary file dataset does not support a citation. 
Please implement this method in a subclass, or use a more specific dataset class.") + + def _loader(self, file: os.PathLike) -> cp.Model: + raise NotImplementedError("Arbitrary file dataset does not support loading. Please implement this method in a subclass, or use a more specific dataset class.") + + def category(self) -> dict: + raise NotImplementedError("Arbitrary file dataset does not support categories. Please implement this method in a subclass, or use a more specific dataset class.") + + def download(self) -> None: + raise NotImplementedError("Arbitrary file dataset does not support downloading. Please implement this method in a subclass, or use a more specific dataset class.") + + def instance_metadata(self, file: os.PathLike) -> dict: + metadata = { + 'dataset_dir': str(self.dataset_dir), + 'name': pathlib.Path(file).name.replace(self.extension, ''), + 'path': file, + } + return metadata + return FromFilesDataset(dataset_dir, extension) class URLDataset(IndexedDataset): """ diff --git a/cpmpy/tools/datasets/transforms.py b/cpmpy/tools/datasets/transforms.py index bfcff8ba2..8421d022f 100644 --- a/cpmpy/tools/datasets/transforms.py +++ b/cpmpy/tools/datasets/transforms.py @@ -42,6 +42,8 @@ import os import re +import cpmpy as cp + _builtins_open = open # capture before any parameter shadowing @@ -125,7 +127,7 @@ def extract_format_metadata(content, format_name): return result -def _enrich_from_model(model, metadata): +def metadata_from_model(model): """Add decision variable and objective info from a CPMpy Model to metadata. 
This is called by transforms that produce CPMpy models (Load, Translate) @@ -135,22 +137,20 @@ def _enrich_from_model(model, metadata): - ``objective``: string representation of the objective expression (if any) - ``objective_is_min``: True if minimizing, False if maximizing (if any) """ - if not hasattr(model, 'constraints'): + + metadata = {} + + if not isinstance(model, cp.Model): return metadata # not a CPMpy Model from cpmpy.transformations.get_variables import get_variables_model from cpmpy.expressions.variables import _BoolVarImpl variables = get_variables_model(model) - metadata['decision_variables'] = [ - { - "name": v.name, - "type": "bool" if isinstance(v, _BoolVarImpl) else "int", - "lb": int(v.lb), - "ub": int(v.ub), - } + metadata['decision_variables'] = { + v.name: v for v in variables - ] + } if model.objective_ is not None: metadata['objective'] = str(model.objective_) From ee5a0305c7f904beaf607372d0a0e45e3041a8ab Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:16:37 +0100 Subject: [PATCH 126/152] SAT dataset --- cpmpy/tools/datasets/sat.py | 199 ++++++++++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 cpmpy/tools/datasets/sat.py diff --git a/cpmpy/tools/datasets/sat.py b/cpmpy/tools/datasets/sat.py new file mode 100644 index 000000000..770539564 --- /dev/null +++ b/cpmpy/tools/datasets/sat.py @@ -0,0 +1,199 @@ +""" +SAT Competition Dataset + +Instances from the benchmark database (benchmark-database.de) for the SAT competition. 
+""" + +import io +import lzma +import os +import pathlib +import re +import tempfile +from urllib.request import Request, urlopen + +from cpmpy.tools.dataset._base import URLDataset +from cpmpy.tools.dataset.utils import download as download_manager + + +# Base URL for the instance list (getinstances returns one file URL per line) +INSTANCE_LIST_URL = "https://benchmark-database.de/getinstances" +DEFAULT_QUERY = "track=main_2025" +DEFAULT_CONTEXT = "cnf" + + +class SATDataset(URLDataset): + """ + SAT competition benchmark dataset (DIMACS CNF). + + Instances are listed at benchmark-database.de via getinstances; each line + is a URL to a CNF file (served XZ-compressed). Files are stored as .cnf.xz. + + More information: https://benchmark-database.de/ + """ + + name = "sat" + description = "SAT competition benchmark instances (DIMACS CNF) from benchmark-database.de." + url = "https://benchmark-database.de/" + license = "" + citation = [] + + def __init__( + self, + root: str = ".", + track: str = "main_2025", + context: str = "cnf", + transform=None, + target_transform=None, + download: bool = False, + **kwargs + ): + """ + Constructor for the SAT competition dataset. + + Arguments: + root (str): Root directory where the dataset is stored or will be downloaded (default="."). + track (str): Track query parameter for getinstances (default="main_2025"). + context (str): Context query parameter for getinstances (default="cnf"). + transform (callable, optional): Optional transform applied to the instance file path. + target_transform (callable, optional): Optional transform applied to the metadata dict. + download (bool): If True, download the instance list and all instances if not present (default=False). + **kwargs: Passed through to download() (e.g. workers for parallel downloads). 
+ """ + self.root = pathlib.Path(root) + self.track = track + self.context = context + + dataset_dir = self.root / self.name / track / context + + super().__init__( + dataset_dir=dataset_dir, + transform=transform, + target_transform=target_transform, + download=download, + extension=".cnf.xz", + **kwargs + ) + + @staticmethod + def reader(file_path, open=open): + """ + Reader for SAT dataset. + Parses a DIMACS CNF file path into a CPMpy model (uses open for .cnf.xz). + """ + with open(file_path) as f: + content = f.read() + return SATDataset.loader(content) + + @staticmethod + def loader(content: str): + """ + Loader for SAT dataset. + Loads a CPMpy model from raw DIMACS CNF content string. + """ + from cpmpy.tools.dimacs import load_dimacs + with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".cnf") as tmp: + tmp.write(content) + tmp_path = tmp.name + try: + return load_dimacs(tmp_path) + finally: + os.unlink(tmp_path) + + def open(self, instance: os.PathLike) -> io.TextIOBase: + """Open instance file; use lzma for .cnf.xz (XZ-compressed) files.""" + path = str(instance) + return lzma.open(instance, "rt") if path.endswith(".xz") else open(instance, "r") + + def instance_metadata(self, file: pathlib.Path) -> dict: + """Add instance metadata; ensure name strips .cnf from stem (e.g. 
hash.cnf.xz -> hash).""" + metadata = super().instance_metadata(file) + stem = pathlib.Path(file).stem + if stem.endswith(".cnf"): + metadata["name"] = stem[:-4] + return metadata + + def category(self) -> dict: + return { + "track": self.track, + "context": self.context, + } + + def collect_instance_metadata(self, file) -> dict: + """Extract num variables and num clauses from DIMACS p-line.""" + result = {} + try: + with self.open(file) as f: + for line in f: + line = line.strip() + if line.startswith("p"): + match = re.search(r"p\s+cnf\s+(\d+)\s+(\d+)", line) + if match: + result["dimacs_num_variables"] = int(match.group(1)) + result["dimacs_num_clauses"] = int(match.group(2)) + break + except Exception: + pass + return result + + def download(self, **kwargs): + """Fetch the instance list from getinstances, then download each CNF file via the download manager.""" + params = f"query=track%3D{self.track}&context={self.context}" + list_url = f"{INSTANCE_LIST_URL}?{params}" + + print(f"Fetching SAT instance list from {list_url}") + req = Request(list_url) + with urlopen(req) as response: + body = response.read().decode("utf-8") + + # One file URL per line (e.g. http://benchmark-database.de/file/00d5a43a...) + file_urls = [line.strip() for line in body.splitlines() if line.strip()] + + if not file_urls: + raise ValueError( + f"No instances returned from {list_url}. " + "Check track and context parameters." 
+ ) + + # Use last path segment (hash) as filename; store as .cnf.xz (server sends XZ-compressed) + def path_to_name(url: str) -> str: + name = url.rstrip("/").split("/")[-1] + if name.lower().endswith(".cnf.xz"): + return name + if name.lower().endswith(".cnf"): + return f"{name}.xz" + return f"{name}.cnf.xz" + + self.dataset_dir.mkdir(parents=True, exist_ok=True) + + # Deduplicate by destination (instance list may contain duplicate URLs) + seen_dest = set() + items = [] + for url in file_urls: + dest = self.dataset_dir / path_to_name(url) + if dest not in seen_dest: + seen_dest.add(dest) + items.append((url, dest)) + + workers = kwargs.get("workers", 1) + print(f"Downloading {len(items)} SAT instances to {self.dataset_dir} (workers={workers})") + download_manager( + items, + desc_prefix="Instance", + skip_existing=True, + **kwargs, + ) + + files = self._list_instances() + if not files: + raise ValueError( + f"Download completed but no .cnf.xz files found in {self.dataset_dir}" + ) + self._collect_all_metadata() + print(f"Finished downloading {len(files)} instances") + + +if __name__ == "__main__": + dataset = SATDataset(track="main_2025", context="cnf", download=True) + print("Dataset size:", len(dataset)) + print("Instance 0:", dataset[0]) From 5335c4371a4718ba52cb44a3a33d58f102251b79 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:33:48 +0100 Subject: [PATCH 127/152] More expressive generator datasets --- cpmpy/tools/datasets/__init__.py | 7 ++- cpmpy/tools/datasets/_base.py | 102 ++++++++++++++++++++++++++++--- 2 files changed, 101 insertions(+), 8 deletions(-) diff --git a/cpmpy/tools/datasets/__init__.py b/cpmpy/tools/datasets/__init__.py index 01c75630f..73336dc96 100644 --- a/cpmpy/tools/datasets/__init__.py +++ b/cpmpy/tools/datasets/__init__.py @@ -1,4 +1,9 @@ -from ._base import extract_model_features, portable_instance_metadata, FileDataset +from ._base import ( + extract_model_features, + expand_varying_kwargs, + 
portable_instance_metadata, + FileDataset, +) from .miplib import MIPLibDataset from .jsplib import JSPLibDataset from .psplib import PSPLibDataset diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py index 9a3f32f67..3a4a80897 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -20,6 +20,7 @@ import io import tempfile import warnings +from itertools import product from typing import Any, Iterator, Optional, Tuple, List, Union, Callable from urllib.error import URLError from urllib.request import HTTPError, Request, urlopen @@ -289,6 +290,45 @@ def __iter__(self): for i in range(len(self)): yield self[i] + +def expand_varying_kwargs( + vary: Union[str, List[str]], + gen_kwargs: dict, + mode: str = "zip", +) -> Iterator[dict]: + """ + Expand gen_kwargs into a sequence of kwargs dicts for varying parameters. + + When ``vary`` is a single string, yields one kwargs dict per value in + ``gen_kwargs[vary]``. + + When ``vary`` is a list of strings, each corresponding value in gen_kwargs + must be an iterable. Yields one kwargs dict per tuple: + - ``mode='zip'``: parallel iteration (zip), all iterables must have same length + - ``mode='product'``: Cartesian product over the varying dimensions + + Arguments: + vary: Name(s) of keys in gen_kwargs whose values are iterables to vary over. + gen_kwargs: Base kwargs; keys in vary are replaced per iteration. + mode: ``'zip'`` (default) or ``'product'``. + + Yields: + dict: Full kwargs for each generator call. 
+ """ + varying_keys = [vary] if isinstance(vary, str) else list(vary) + base_kwargs = {k: v for k, v in gen_kwargs.items() if k not in varying_keys} + varying_iters = [gen_kwargs[k] for k in varying_keys] + + if mode == "zip": + for values in zip(*varying_iters): + yield {**base_kwargs, **dict(zip(varying_keys, values))} + elif mode == "product": + for values in product(*varying_iters): + yield {**base_kwargs, **dict(zip(varying_keys, values))} + else: + raise ValueError(f"mode must be 'zip' or 'product', got {mode!r}") + + class IterableDataset(Dataset): """ Abstract base class for iterable datasets. @@ -307,18 +347,66 @@ def __iter__(self) -> Iterator[Tuple[Any, Any]]: pass @staticmethod - def from_generator(generator: callable) -> IterableDataset: + def from_generator( + generator: Callable, + gen_kwargs: Optional[dict] = None, + vary: Optional[Union[str, List[str]]] = None, + vary_mode: str = "zip", + ) -> IterableDataset: """ Create an IterableDataset from a generator. + + Arguments: + generator: Callable that returns an iterator yielding (x, y) pairs. + When ``vary`` is None, called as ``generator()`` or + ``generator(**gen_kwargs)``. When ``vary`` is set, called once + per value (or tuple of values) of the varying kwarg(s). + gen_kwargs: Optional dict of keyword arguments to pass to the generator. + vary: Optional name or list of names of keys in gen_kwargs whose values + are iterables. If a single string, the generator is called once per + value. If a list of strings, the generator is called once per tuple + from zip (default) or product of the iterables. + vary_mode: When ``vary`` is a list, ``'zip'`` (parallel iteration, + same-length iterables) or ``'product'`` (Cartesian product). 
""" - class FromGeneratorDataset(IterableDataset): - def __init__(self, generator: callable): - self.generator = generator + gen_kwargs = gen_kwargs or {} + + if vary is not None: + # Variant: call generator once per expanded kwargs + class FromGeneratorVariedDataset(IterableDataset): + def __init__( + self, + generator: Callable, + gen_kwargs: dict, + vary: Union[str, List[str]], + vary_mode: str, + ): + self.generator = generator + self.gen_kwargs = gen_kwargs + self.vary = vary + self.vary_mode = vary_mode + + def __iter__(self): + for kwargs in expand_varying_kwargs( + self.vary, self.gen_kwargs, mode=self.vary_mode + ): + for item in self.generator(**kwargs): + yield item + + return FromGeneratorVariedDataset( + generator, gen_kwargs, vary, vary_mode + ) + else: + # Original: single call to generator + class FromGeneratorDataset(IterableDataset): + def __init__(self, generator: Callable, gen_kwargs: dict): + self.generator = generator + self.gen_kwargs = gen_kwargs - def __iter__(self): - return self.generator() + def __iter__(self): + return self.generator(**self.gen_kwargs) - return FromGeneratorDataset(generator) + return FromGeneratorDataset(generator, gen_kwargs) class FileDataset(IndexedDataset): """ From c46cf03c7f2bb5677db2d0eb6b09cfe5dc9aabc4 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Sun, 1 Mar 2026 15:59:06 +0100 Subject: [PATCH 128/152] Dataset metadata classproperty --- cpmpy/tools/datasets/_base.py | 25 +++++++++++++++++++------ cpmpy/tools/datasets/mse.py | 12 +++++++----- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py index 3a4a80897..bbe79d83b 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -63,6 +63,19 @@ def _format_bytes(bytes_num): bytes_num /= 1024.0 +class classproperty: + """ + Descriptor that makes a method work as a class-level property (no () needed). + Similar to @property, but for class methods. 
+ """ + + def __init__(self, func): + self.func = func + + def __get__(self, instance, owner): + return self.func(owner) + + def portable_instance_metadata(metadata: dict) -> dict: """ Filter sidecar metadata to only portable, domain-specific fields. @@ -423,19 +436,19 @@ class FileDataset(IndexedDataset): # -------------- Dataset-level metadata (override in subclasses) ------------- # - @property + @classproperty @abstractmethod def name(self) -> str: pass - @property + @classproperty @abstractmethod def description(self) -> str: pass - @property + @classproperty @abstractmethod def url(self) -> str: pass - @property + @classproperty def citation(self) -> List[str]: return [] @@ -626,8 +639,9 @@ def instance_metadata(self, file: os.PathLike) -> dict: Returns: dict: The metadata for the instance. """ - metadata = self.category() | { + metadata = { 'dataset': self.name, + 'category': self.category(), 'name': pathlib.Path(file).name.replace(self.extension, ''), 'path': file, } @@ -659,7 +673,6 @@ def dataset_metadata(cls) -> dict: "name": cls.name, "description": cls.description, "url": cls.url, - "license": cls.license, "citation": citations, } diff --git a/cpmpy/tools/datasets/mse.py b/cpmpy/tools/datasets/mse.py index bfc948fdc..65fb346a7 100644 --- a/cpmpy/tools/datasets/mse.py +++ b/cpmpy/tools/datasets/mse.py @@ -14,7 +14,7 @@ import cpmpy as cp from cpmpy.tools.io.wcnf import load_wcnf -from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets._base import FileDataset, classproperty class MaxSATEvalDataset(FileDataset): # torch.utils.data.Dataset compatible @@ -33,19 +33,21 @@ class MaxSATEvalDataset(FileDataset): # torch.utils.data.Dataset compatible # -------------------------- Dataset-level metadata -------------------------- # - @property + _metadata_init_kwargs = {"year": 2024, "track": "exact-unweighted"} + + @classproperty def name(self) -> str: return "maxsateval" - @property + @classproperty def description(self) -> str: return 
"MaxSAT Evaluation competition benchmark instances." - @property + @classproperty def url(self) -> str: return "https://maxsat-evaluations.github.io/" - @property + @classproperty def citation(self) -> List[str]: return [] From ea40e28bf48c6443b1095e880ff9b01b36803efc Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 2 Mar 2026 09:51:02 +0100 Subject: [PATCH 129/152] Vastly expanded metadata system based on industry best practices --- cpmpy/tools/datasets/__init__.py | 41 ++ cpmpy/tools/datasets/_base.py | 144 ++++- cpmpy/tools/datasets/jsplib.py | 29 +- cpmpy/tools/datasets/metadata.py | 734 +++++++++++++++++++++++++ cpmpy/tools/datasets/miplib.py | 10 +- cpmpy/tools/datasets/mse.py | 50 +- cpmpy/tools/datasets/nurserostering.py | 14 +- cpmpy/tools/datasets/opb.py | 28 +- cpmpy/tools/datasets/psplib.py | 28 +- cpmpy/tools/datasets/transforms.py | 9 + cpmpy/tools/datasets/xcsp3.py | 18 +- 11 files changed, 1027 insertions(+), 78 deletions(-) create mode 100644 cpmpy/tools/datasets/metadata.py diff --git a/cpmpy/tools/datasets/__init__.py b/cpmpy/tools/datasets/__init__.py index 73336dc96..2ef1bd91a 100644 --- a/cpmpy/tools/datasets/__init__.py +++ b/cpmpy/tools/datasets/__init__.py @@ -4,6 +4,47 @@ portable_instance_metadata, FileDataset, ) +from .metadata import ( + InstanceInfo, + DatasetInfo, + FeaturesInfo, + FieldInfo, + to_croissant_example, + to_gbd_features, +) + +__all__ = [ + # Base + "FileDataset", + "extract_model_features", + "expand_varying_kwargs", + "portable_instance_metadata", + # Metadata + "InstanceInfo", + "DatasetInfo", + "FeaturesInfo", + "FieldInfo", + "to_croissant_example", + "to_gbd_features", + # Datasets + "MIPLibDataset", + "JSPLibDataset", + "PSPLibDataset", + "NurseRosteringDataset", + "XCSP3Dataset", + "OPBDataset", + "MaxSATEvalDataset", + # Transforms + "Compose", + "Open", + "Load", + "Parse", + "Serialize", + "Translate", + "SaveToFile", + "Lambda", + "extract_format_metadata", +] from .miplib import MIPLibDataset from .jsplib 
import JSPLibDataset from .psplib import PSPLibDataset diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/_base.py index bbe79d83b..6320a5e6e 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/_base.py @@ -37,18 +37,16 @@ import cpmpy as cp -# TODO: move elsewhere? -# Fields produced by extract_model_features() (after loading into a CPMpy model) -# - not portable across format translations -_MODEL_FEATURE_FIELDS = frozenset({ - "num_variables", "num_bool_variables", "num_int_variables", - "num_constraints", "constraint_types", "has_objective", - "objective_type", "domain_size_min", "domain_size_max", "domain_size_mean", -}) +from .metadata import ( + InstanceInfo, DatasetInfo, FeaturesInfo, FieldInfo, + _MODEL_FEATURE_FIELDS, _FORMAT_SPECIFIC_PREFIXES, +) -# TODO: move elsewhere? -# Prefixes for format-specific metadata fields (not portable across translations) -_FORMAT_SPECIFIC_PREFIXES = ("opb_", "wcnf_", "mps_", "xcsp_", "dimacs_") +# Re-export constants for backward compatibility with code that imports from _base +__all__ = [ + "_MODEL_FEATURE_FIELDS", "_FORMAT_SPECIFIC_PREFIXES", + "InstanceInfo", "DatasetInfo", "FeaturesInfo", "FieldInfo", +] def _format_bytes(bytes_num): @@ -71,6 +69,7 @@ class classproperty: def __init__(self, func): self.func = func + self.__isabstractmethod__ = getattr(func, '__isabstractmethod__', False) def __get__(self, instance, owner): return self.func(owner) @@ -435,7 +434,7 @@ class FileDataset(IndexedDataset): METADATA_EXTENSION = ".meta.json" # -------------- Dataset-level metadata (override in subclasses) ------------- # - + @classproperty @abstractmethod def name(self) -> str: pass @@ -446,19 +445,64 @@ def description(self) -> str: pass @classproperty @abstractmethod - def url(self) -> str: pass + def homepage(self) -> str: pass @classproperty - def citation(self) -> List[str]: + def citation(self) -> List[str]: return [] - # TODO: remove for now? 
+ # Optional enrichment — all have sensible defaults, zero lines required + version: Optional[str] = None # e.g. "2024", "1.0.0" + license: Optional[Union[str, List[str]]] = None # e.g. "MIT", ["CC BY 4.0"] + domain: str = "constraint_programming" # e.g. "scheduling", "sat" + tags: List[str] = [] # e.g. ["optimization", "scheduling"] + language: Optional[str] = None # e.g. "XCSP3", "OPB", "JSPLib" + features: Optional[FeaturesInfo] = None # domain_metadata field schema + release_notes: Optional[dict] = None # {version: changelog} + # Multiple download origins (override in subclasses or via config) - # Origins are tried in order, falling back to original url if all fail - origins: List[str] = [] # List of URL bases to try before falling back to original url + # Origins are tried in order, falling back to homepage if all fail + origins: List[str] = [] # ---------------------------------------------------------------------------- # + def __init_subclass__(cls, **kwargs): + """ + Auto-merge ``features`` when a subclass declares only its *new* fields. + + If a subclass explicitly defines ``features``, it is merged with the + nearest ancestor's ``features`` so the subclass only needs to list + what is new. The subclass fields take precedence over inherited ones. + + .. code-block:: python + + class MyJSPDataset(JSPLibDataset): + # No need to repeat {jobs, machines, optimum, ...} — they are + # inherited and merged in automatically. + features = FeaturesInfo({"difficulty": ("float", "Computed difficulty score")}) + + def collect_instance_metadata(self, file): + meta = super().collect_instance_metadata(file) + meta["difficulty"] = ... + return meta + + To *replace* rather than extend the parent schema, explicitly set + ``features`` to the complete schema you want (the auto-merge still + runs, but if you start from scratch the parent's fields will be + absent from the parent's FeaturesInfo and won't be merged). 
+ Alternatively, set ``features = None`` to clear the schema entirely. + """ + super().__init_subclass__(**kwargs) + subclass_features = cls.__dict__.get("features") + if subclass_features is None: + return + # Walk the MRO to find the nearest ancestor that has features defined + for base in cls.__mro__[1:]: + parent_features = base.__dict__.get("features") + if parent_features is not None: + cls.features = parent_features | subclass_features + return + def __init__( self, @@ -622,22 +666,27 @@ def load(self, instance: Union[str, os.PathLike]) -> cp.Model: content = instance # Loading - turn raw contents into CPMpy model - return self.loader(content) + return self._loader(content) # ---------------------------------------------------------------------------- # # Public interface # # ---------------------------------------------------------------------------- # - def instance_metadata(self, file: os.PathLike) -> dict: + def instance_metadata(self, file: os.PathLike) -> InstanceInfo: """ Return the metadata for a given instance file. + Returns an :class:`~metadata.InstanceInfo`, which is a ``dict`` subclass + so all existing ``meta['year']``, ``meta.get('jobs')`` access is unchanged. + Structured access via ``info.domain_metadata``, ``info.model_features``, + ``info.id``, etc. is additive. + Arguments: file (os.PathLike): Path to the instance file. Returns: - dict: The metadata for the instance. + InstanceInfo: The metadata for the instance. """ metadata = { 'dataset': self.name, @@ -654,27 +703,68 @@ def instance_metadata(self, file: os.PathLike) -> dict: metadata.update(sidecar.get("instance_metadata", {})) metadata.update(sidecar.get("format_metadata", {})) metadata.update(sidecar.get("model_features", {})) - return metadata + return InstanceInfo(metadata) @classmethod - def dataset_metadata(cls) -> dict: + def dataset_metadata(cls) -> DatasetInfo: """ - Return dataset-level metadata as a dictionary. 
+ Return dataset-level metadata as a :class:`~metadata.DatasetInfo`. + + :class:`~metadata.DatasetInfo` is a ``dict`` subclass, so existing + ``dataset_metadata()['name']`` access continues to work unchanged. + New structured access (``dataset_metadata().card()``, + ``dataset_metadata().to_croissant()``, etc.) is additive. Returns: - dict: The dataset-level metadata. + DatasetInfo: The dataset-level metadata. """ if isinstance(cls.citation, str): citations = [cls.citation] if cls.citation else [] else: citations = list(cls.citation) - return { + # Serialise FeaturesInfo to a plain dict so the DatasetInfo is JSON-safe + # (the DatasetInfo.features property reconstructs FeaturesInfo on access) + features_dict = None + if cls.features is not None: + features_dict = cls.features.to_dict() + + return DatasetInfo({ "name": cls.name, "description": cls.description, - "url": cls.url, + "url": cls.homepage, # backward-compat key + "homepage": cls.homepage, # HuggingFace / TFDS naming "citation": citations, - } + "version": cls.version, + "license": cls.license, + "domain": cls.domain, + "tags": list(cls.tags), + "language": cls.language, + "features": features_dict, + "release_notes": cls.release_notes, + }) + + @classmethod + def card(cls, format: str = "markdown") -> str: + """ + Generate a dataset card for this dataset. + + Shorthand for ``cls.dataset_metadata().card(format=format)``. + + Follows HuggingFace Hub convention: YAML frontmatter (machine-readable) + followed by a markdown body (human-readable). + + Parameters + ---------- + format: + Only ``"markdown"`` is currently supported. + + Returns + ------- + str + The dataset card as a string. 
+ """ + return cls.dataset_metadata().card(format=format) # ---------------------------------------------------------------------------- # diff --git a/cpmpy/tools/datasets/jsplib.py b/cpmpy/tools/datasets/jsplib.py index 365151876..9f4ec507f 100644 --- a/cpmpy/tools/datasets/jsplib.py +++ b/cpmpy/tools/datasets/jsplib.py @@ -15,6 +15,7 @@ import cpmpy as cp from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo class JSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible @@ -27,7 +28,7 @@ class JSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible name = "jsplib" description = "Job Shop Scheduling Problem benchmark library." - url = "https://github.com/tamy0612/JSPLIB" + homepage = "https://github.com/tamy0612/JSPLIB" citation = [ "J. Adams, E. Balas, D. Zawack. 'The shifting bottleneck procedure for job shop scheduling.', Management Science, Vol. 34, Issue 3, pp. 391-401, 1988.", "J.F. Muth, G.L. Thompson. 'Industrial scheduling.', Englewood Cliffs, NJ, Prentice-Hall, 1963.", @@ -38,6 +39,19 @@ class JSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible "E. Taillard. 'Benchmarks for basic scheduling problems', European Journal of Operational Research, Vol. 64, Issue 2, pp. 
278-285, 1993.", ] + version = "1.0.0" + license = "MIT" + domain = "scheduling" + tags = ["optimization", "job-shop-scheduling", "scheduling", "combinatorial"] + language = "JSPLib" + features = FeaturesInfo({ + "jobs": ("int", "Number of jobs"), + "machines": ("int", "Number of machines"), + "optimum": FieldInfo("int", "Known optimal makespan, if available", nullable=True), + "bounds": FieldInfo("dict", "Upper/lower bounds on the optimal makespan", nullable=True), + "instance_description": FieldInfo("str", "Human-readable description from file header comments", nullable=True), + }) + def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): """ Initialize the JSPLib Dataset. @@ -62,19 +76,8 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl metadata_workers=metadata_workers ) - - @staticmethod - def reader(file_path, open=open): - """ - Reader for JSPLib dataset. - Parses a file path directly into a CPMpy model. - For backward compatibility. Consider using read() + load() instead. - """ - from cpmpy.tools.io.jsplib import load_jsplib - return load_jsplib(file_path, open=open) - @staticmethod - def loader(content: str): + def _loader(content: str): """ Loader for JSPLib dataset. Loads a CPMpy model from raw JSPLib content string. 
diff --git a/cpmpy/tools/datasets/metadata.py b/cpmpy/tools/datasets/metadata.py new file mode 100644 index 000000000..134019d25 --- /dev/null +++ b/cpmpy/tools/datasets/metadata.py @@ -0,0 +1,734 @@ +""" +Structured Metadata Classes for CPMpy Datasets + +Provides: +- :class:`FieldInfo` — schema for one domain metadata field +- :class:`FeaturesInfo` — schema for all domain metadata fields of a dataset +- :class:`InstanceInfo` — dict-compatible per-instance metadata with structured access +- :class:`DatasetInfo` — dict-compatible dataset-level metadata with card/Croissant export +- :func:`to_croissant_example` — adapter for use as ``target_transform`` +- :func:`to_gbd_features` — adapter for use as ``target_transform`` + +Design notes +------------ +``InstanceInfo`` and ``DatasetInfo`` both inherit from ``dict``, so all existing +``meta['year']``, ``meta.get('jobs')``, and ``dataset_metadata()['name']`` calls +continue to work unchanged. Structured access (``info.domain_metadata``, +``info.model_features``, ``DatasetInfo.card()``, etc.) is purely additive. 
+ +Inspired by: +- HuggingFace ``datasets.DatasetInfo`` and ``Features``/``Value`` +- TensorFlow Datasets ``DatasetInfo``, ``FeatureConnector``, and ``BuilderConfig`` +- MLCommons Croissant 1.0 (JSON-LD metadata standard) +- Global Benchmark Database (GBD) feature records +""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + + +# --------------------------------------------------------------------------- +# Constants — keys that partition the flat instance metadata dict +# --------------------------------------------------------------------------- + +# System-level keys added by instance_metadata() — not domain metadata +_SYSTEM_KEYS: frozenset = frozenset({"dataset", "category", "name", "path"}) + +# Fields produced by extract_model_features() (requires full CPMpy model parse) +_MODEL_FEATURE_FIELDS: frozenset = frozenset({ + "num_variables", "num_bool_variables", "num_int_variables", + "num_constraints", "constraint_types", "has_objective", + "objective_type", "domain_size_min", "domain_size_max", "domain_size_mean", +}) + +# Prefixes for format-specific metadata (not portable across translations) +_FORMAT_SPECIFIC_PREFIXES: tuple = ("opb_", "wcnf_", "mps_", "xcsp_", "dimacs_") + + +# --------------------------------------------------------------------------- +# FieldInfo +# --------------------------------------------------------------------------- + +@dataclass +class FieldInfo: + """ + Schema declaration for a single domain metadata field. + + Inspired by HuggingFace ``Value`` and TFDS ``FeatureConnector``, but + intentionally simpler — no serialisation semantics needed for CO benchmarks. + + Parameters + ---------- + dtype: + Data type string: ``"int"``, ``"float"``, ``"str"``, ``"bool"``, + ``"dict"``, or ``"list"``. + description: + Human-readable description of the field. + nullable: + Whether the field may be absent / ``None`` for some instances. 
+ example: + Optional example value (used in documentation / cards). + """ + + dtype: str + description: str = "" + nullable: bool = True + example: Any = None + + # Maps internal dtype strings → schema.org types (for Croissant export) + _DTYPE_TO_SCHEMA_ORG: Dict[str, str] = None # populated below as class var + + def schema_org_type(self) -> str: + """Return the schema.org dataType string for use in Croissant fields.""" + return _DTYPE_TO_SCHEMA_ORG.get(self.dtype, "sc:Text") + + @classmethod + def coerce(cls, value: Any) -> "FieldInfo": + """ + Normalise shorthand input into a :class:`FieldInfo`. + + Accepted forms: + + - ``FieldInfo(...)`` — returned as-is + - ``"int"`` — treated as ``FieldInfo(dtype="int")`` + - ``("int", "desc")`` — ``FieldInfo(dtype="int", description="desc")`` + - ``("int", "desc", False)`` — adds ``nullable=False`` + """ + if isinstance(value, cls): + return value + if isinstance(value, str): + return cls(dtype=value) + if isinstance(value, tuple): + return cls(*value) + raise TypeError( + f"Cannot coerce {value!r} to FieldInfo. " + "Use a FieldInfo, a dtype string, or a (dtype, description[, nullable]) tuple." + ) + + def to_dict(self) -> dict: + """Serialisable plain dict (for JSON sidecar storage).""" + d = {"dtype": self.dtype, "description": self.description, "nullable": self.nullable} + if self.example is not None: + d["example"] = self.example + return d + + +# Class-level constant (defined after the class to avoid dataclass conflicts) +_DTYPE_TO_SCHEMA_ORG: Dict[str, str] = { + "int": "sc:Integer", + "float": "sc:Float", + "str": "sc:Text", + "bool": "sc:Boolean", + "dict": "sc:StructuredValue", + "list": "sc:ItemList", +} + + +# --------------------------------------------------------------------------- +# FeaturesInfo +# --------------------------------------------------------------------------- + +class FeaturesInfo: + """ + Schema for all domain metadata fields of a dataset. 
+ + Analogous to HuggingFace ``Features`` or TFDS ``FeatureConnector`` trees, + but without serialisation encoding — purely declarative. + + The constructor accepts a plain ``dict`` whose values are anything accepted + by :meth:`FieldInfo.coerce`: + + .. code-block:: python + + # Minimal — just type strings + FeaturesInfo({"jobs": "int", "machines": "int"}) + + # With descriptions + FeaturesInfo({"jobs": ("int", "Number of jobs")}) + + # Full control where needed + FeaturesInfo({ + "jobs": ("int", "Number of jobs"), + "optimum": FieldInfo("int", "Known optimal makespan", nullable=True), + }) + """ + + def __init__(self, fields: Dict[str, Any]): + self.fields: Dict[str, FieldInfo] = { + k: FieldInfo.coerce(v) for k, v in fields.items() + } + + def __repr__(self) -> str: + return f"FeaturesInfo({self.fields!r})" + + def __or__(self, other: "FeaturesInfo") -> "FeaturesInfo": + """ + Merge two :class:`FeaturesInfo` schemas, with ``other`` taking + precedence for any duplicate field names. + + Follows the same convention as Python's ``dict | dict`` (Python 3.9+). + + .. code-block:: python + + # Explicit merge — useful when you want full control: + class MyJSPDataset(JSPLibDataset): + features = JSPLibDataset.features | FeaturesInfo({"difficulty": "float"}) + """ + merged = FeaturesInfo.__new__(FeaturesInfo) + merged.fields = {**self.fields, **other.fields} + return merged + + def validate(self, domain_metadata: dict) -> List[str]: + """ + Validate a domain_metadata dict against this schema. + + Returns a list of error strings (empty list = valid). + """ + errors = [] + for name, fi in self.fields.items(): + if not fi.nullable and name not in domain_metadata: + errors.append(f"Required field '{name}' missing from domain_metadata") + return errors + + def to_croissant_fields(self) -> List[dict]: + """ + Generate a list of Croissant ``cr:Field`` dicts for use in a + ``cr:RecordSet``. 
+ """ + result = [] + for name, fi in self.fields.items(): + cr_field: Dict[str, Any] = { + "@type": "cr:Field", + "name": name, + "dataType": _DTYPE_TO_SCHEMA_ORG.get(fi.dtype, "sc:Text"), + } + if fi.description: + cr_field["description"] = fi.description + result.append(cr_field) + return result + + def to_dict(self) -> dict: + """Serialisable plain dict (for JSON sidecar storage).""" + return {name: fi.to_dict() for name, fi in self.fields.items()} + + @classmethod + def from_dict(cls, d: dict) -> "FeaturesInfo": + """Reconstruct from the serialised plain dict produced by :meth:`to_dict`.""" + return cls({ + name: FieldInfo( + dtype=v.get("dtype", "str"), + description=v.get("description", ""), + nullable=v.get("nullable", True), + example=v.get("example"), + ) + for name, v in d.items() + }) + + +# --------------------------------------------------------------------------- +# InstanceInfo +# --------------------------------------------------------------------------- + +class InstanceInfo(dict): + """ + Per-instance metadata dict with structured access. + + Inherits from ``dict`` for full backward compatibility — all existing + ``meta['year']``, ``meta.get('jobs')``, ``for k, v in meta.items()`` + usage continues unchanged. + + Structured access is additive: + + .. code-block:: python + + file, info = dataset[0] + + # Backward-compatible dict access (unchanged): + info['year'] + info.get('jobs', 0) + + # New structured properties: + info.id # "jsplib/abz5" + info.domain_metadata # {"jobs": 10, "machines": 5, ...} + info.model_features # {"num_variables": 100, ...} + info.format_metadata # {"opb_num_variables": 12, ...} + + # Standards converters: + info.to_croissant_example() + info.to_gbd_features() + """ + + @property + def id(self) -> str: + """ + Stable instance identifier. 
+ + Format: ``"dataset/cat_val1/cat_val2/.../instance_name"`` + + Example: ``"xcsp3/2024/CSP/AverageAvoiding-20_c24"`` + """ + parts = [str(self.get("dataset", ""))] + cat = self.get("category", {}) + if isinstance(cat, dict): + parts += [str(v) for v in cat.values()] + parts.append(str(self.get("name", ""))) + return "/".join(p for p in parts if p) + + @property + def name(self) -> str: + """Human-readable instance name.""" + return self.get("name", "") + + @property + def dataset(self) -> str: + """Parent dataset name.""" + return self.get("dataset", "") + + @property + def category(self) -> dict: + """Category dict (year, track, variant, family, …).""" + return self.get("category", {}) + + @property + def domain_metadata(self) -> dict: + """ + Domain-specific metadata fields. + + These are format-independent, problem-level fields such as + ``jobs``, ``machines``, ``optimum``, ``horizon``, ``num_staff``, etc. + + Excludes system keys, CP model statistics, and format-specific fields. + """ + return { + k: v for k, v in self.items() + if k not in _SYSTEM_KEYS + and k not in _MODEL_FEATURE_FIELDS + and not any(k.startswith(p) for p in _FORMAT_SPECIFIC_PREFIXES) + } + + @property + def model_features(self) -> dict: + """ + CP model statistics extracted via ``collect_features()``. + + Fields: ``num_variables``, ``num_bool_variables``, ``num_int_variables``, + ``num_constraints``, ``constraint_types``, ``has_objective``, + ``objective_type``, ``domain_size_min``, ``domain_size_max``, + ``domain_size_mean``. + """ + return {k: v for k, v in self.items() if k in _MODEL_FEATURE_FIELDS} + + @property + def format_metadata(self) -> dict: + """ + Format-specific metadata fields (``opb_*``, ``wcnf_*``, ``mps_*``, …). + + These are not portable across format translations. 
+ """ + return { + k: v for k, v in self.items() + if any(k.startswith(p) for p in _FORMAT_SPECIFIC_PREFIXES) + } + + def without_format(self) -> "InstanceInfo": + """ + Return a copy with all format-specific metadata removed. + + Use when changing representation format. Chain with ``|`` to add new + format fields, or use as-is to just strip: + + .. code-block:: python + + # Strip and add new format fields + return opb_bytes, info.without_format() | extract_opb_features(opb_bytes) + return opb_bytes, info.without_format() | {"opb_num_variables": 47} + + # Just strip + return opb_bytes, info.without_format() + + # Simple addition without format change (most common) + return data, info | {"difficulty": 0.8} + """ + return InstanceInfo({k: v for k, v in self.items() + if not any(k.startswith(p) for p in _FORMAT_SPECIFIC_PREFIXES)}) + + def __or__(self, other: dict) -> "InstanceInfo": + return InstanceInfo(super().__or__(other)) + + def __ror__(self, other: dict) -> "InstanceInfo": + return InstanceInfo(super().__ror__(other)) + + def to_croissant_example(self) -> dict: + """ + Convert to a Croissant-compatible example record. + + Returns a flat dict with ``id``, domain metadata, and model features. + """ + record: dict = {"id": self.id} + record.update(self.domain_metadata) + record.update(self.model_features) + return record + + def to_gbd_features(self) -> dict: + """ + Convert to a GBD-style (Global Benchmark Database) feature record. + + GBD uses hash-based instance IDs; here we use the path-based ``.id`` + property as a stable identifier instead. 
+ """ + record: dict = { + "id": self.id, + "filename": self.get("name", ""), + "dataset": self.get("dataset", ""), + } + record.update(self.category) + record.update(self.domain_metadata) + record.update(self.model_features) + return record + + +# --------------------------------------------------------------------------- +# DatasetInfo +# --------------------------------------------------------------------------- + +# CP model feature fields documented in dataset cards +_MODEL_FEATURE_DOCS = [ + ("num_variables", "int", "Total number of decision variables"), + ("num_bool_variables", "int", "Number of Boolean variables"), + ("num_int_variables", "int", "Number of integer variables"), + ("num_constraints", "int", "Total number of constraints"), + ("constraint_types", "dict", 'Map: constraint type name → count (e.g. ``{"==": 50, "no_overlap": 3}``)'), + ("has_objective", "bool", "Whether the instance has an objective function"), + ("objective_type", "str", '``"min"``, ``"max"``, or ``"none"``'), + ("domain_size_min", "int", "Minimum variable domain size"), + ("domain_size_max", "int", "Maximum variable domain size"), + ("domain_size_mean", "float", "Mean variable domain size"), +] + +# schema.org types for model feature fields (for Croissant export) +_MODEL_FEATURE_SCHEMA_ORG = { + "num_variables": "sc:Integer", "num_bool_variables": "sc:Integer", + "num_int_variables": "sc:Integer", "num_constraints": "sc:Integer", + "constraint_types": "sc:StructuredValue", "has_objective": "sc:Boolean", + "objective_type": "sc:Text", "domain_size_min": "sc:Integer", + "domain_size_max": "sc:Integer", "domain_size_mean": "sc:Float", +} + + +class DatasetInfo(dict): + """ + Dataset-level metadata dict with structured access and export methods. + + Inherits from ``dict`` for full backward compatibility — existing + ``dataset_metadata()['name']`` access continues unchanged. 
+ + Structured properties (``version``, ``license``, ``tags``, ``domain``, + ``language``, ``features``) and methods (:meth:`card`, :meth:`to_croissant`) + are additive. + + Analogous to HuggingFace ``DatasetInfo`` and TFDS ``DatasetInfo``. + """ + + # -- Structured properties ------------------------------------------------ + + @property + def name(self) -> str: + return self.get("name", "") + + @property + def description(self) -> str: + return self.get("description", "") + + @property + def url(self) -> str: + """Homepage URL (backward-compat alias for :attr:`homepage`).""" + return self.get("url", "") or self.get("homepage", "") + + @property + def homepage(self) -> str: + """Homepage URL (HuggingFace / TFDS naming convention).""" + return self.get("homepage", "") or self.get("url", "") + + @property + def version(self) -> Optional[str]: + return self.get("version") + + @property + def license(self) -> Optional[Union[str, List[str]]]: + return self.get("license") + + @property + def domain(self) -> str: + """Primary problem domain (e.g. ``"scheduling"``, ``"sat"``, ``"cp"``).""" + return self.get("domain", "constraint_programming") + + @property + def tags(self) -> List[str]: + return self.get("tags", []) + + @property + def language(self) -> Optional[str]: + """ + Problem format / modelling language (e.g. ``"XCSP3"``, ``"OPB"``, ``"JSPLib"``). + + Analogous to HuggingFace's ``language`` field, but for CO format languages + rather than human languages. + """ + return self.get("language") + + @property + def features(self) -> Optional[FeaturesInfo]: + """ + Schema for domain metadata fields. + + Reconstructed from the serialised dict stored in the ``"features"`` key, + so this property works whether the DatasetInfo was created programmatically + or loaded from a JSON sidecar. 
+ """ + raw = self.get("features") + if raw is None: + return None + if isinstance(raw, FeaturesInfo): + return raw + if isinstance(raw, dict): + return FeaturesInfo.from_dict(raw) + return None + + @property + def release_notes(self) -> Optional[Dict[str, str]]: + """ + Version changelog dict: ``{version_string: description}``. + + Inspired by TFDS ``BuilderConfig.release_notes``. + + Example:: + + release_notes = { + "1.0.0": "Initial release.", + "1.1.0": "Added 2024 track instances.", + } + """ + return self.get("release_notes") + + # -- Card generation ------------------------------------------------------ + + def card(self, format: str = "markdown") -> str: + """ + Generate a dataset card. + + Follows the HuggingFace Hub convention: a YAML frontmatter block + (machine-readable) followed by a markdown body (human-readable). + Sections are omitted gracefully when data is absent. + + Parameters + ---------- + format: + Only ``"markdown"`` is supported currently. + + Returns + ------- + str + The dataset card as a string. 
+ """ + lines: List[str] = [] + + # --- YAML frontmatter (HuggingFace convention) --- + lines.append("---") + lines.append(f"name: {self.name}") + if self.version: + lines.append(f"version: {self.version}") + lic = self.license + if lic: + if isinstance(lic, list): + lines.append("license:") + for entry in lic: + lines.append(f" - {entry}") + else: + lines.append(f"license: {lic}") + if self.tags: + lines.append("tags:") + for tag in self.tags: + lines.append(f" - {tag}") + lines.append(f"domain: {self.domain}") + if self.language: + lines.append(f"language: {self.language}") + lines.append("---") + lines.append("") + + # --- Markdown body --- + lines.append(f"# {self.name} Dataset") + lines.append("") + if self.description: + lines.append(self.description) + lines.append("") + if self.homepage: + lines.append(f"**Homepage:** {self.homepage}") + lines.append("") + + # License + if lic: + lines.append("## License") + lines.append("") + if isinstance(lic, list): + for entry in lic: + lines.append(f"- {entry}") + else: + lines.append(str(lic)) + lines.append("") + + # Citation + citations = self.get("citation", []) + if citations: + lines.append("## Citation") + lines.append("") + for c in citations: + lines.append(f"- {c}") + lines.append("") + + # Changelog + rn = self.release_notes + if rn: + lines.append("## Changelog") + lines.append("") + for ver, note in rn.items(): + lines.append(f"- **{ver}**: {note}") + lines.append("") + + # Instance features (domain metadata schema) + features = self.features + if features and features.fields: + lines.append("## Instance Features (Domain Metadata)") + lines.append("") + lines.append("| Field | Type | Nullable | Description |") + lines.append("|-------|------|----------|-------------|") + for fname, fi in features.fields.items(): + nullable_str = "Yes" if fi.nullable else "No" + lines.append(f"| `{fname}` | {fi.dtype} | {nullable_str} | {fi.description} |") + lines.append("") + + # CP model features (always documented) + 
lines.append("## CP Model Features (from `collect_features()`)") + lines.append("") + lines.append("| Field | Type | Description |") + lines.append("|-------|------|-------------|") + for fname, ftype, fdesc in _MODEL_FEATURE_DOCS: + lines.append(f"| `{fname}` | {ftype} | {fdesc} |") + lines.append("") + + # Usage example + lines.append("## Usage") + lines.append("") + lines.append("```python") + # Best-effort class name guess from dataset name + class_guess = self.name.replace("-", "_").title().replace("_", "") + "Dataset" + lines.append(f"from cpmpy.tools.datasets import {class_guess}") + lines.append(f'dataset = {class_guess}(root="./data", download=True)') + lines.append("for instance, info in dataset:") + lines.append(" print(info.name, info.domain_metadata)") + lines.append("```") + lines.append("") + + return "\n".join(lines) + + # -- Croissant export ----------------------------------------------------- + + def to_croissant(self) -> dict: + """ + Generate a Croissant-compatible JSON-LD dataset metadata document. + + Follows the `MLCommons Croissant 1.0 + `_ specification. + + Returns + ------- + dict + A JSON-serialisable dict representing the Croissant document. + Pass to ``json.dumps()`` to get the JSON string. 
+ """ + doc: Dict[str, Any] = { + "@context": { + "@vocab": "https://schema.org/", + "cr": "http://mlcommons.org/croissant/1.0", + "sc": "https://schema.org/", + }, + "@type": "sc:Dataset", + "name": self.name, + "description": self.description, + "url": self.homepage, + } + + if self.version: + doc["version"] = self.version + lic = self.license + if lic: + doc["license"] = lic if isinstance(lic, str) else ", ".join(lic) + if self.tags: + doc["keywords"] = self.tags + + citations = self.get("citation", []) + if citations: + doc["citation"] = "\n".join(citations) + + # Build RecordSet: id + name + path + domain fields + model features + cr_fields: List[dict] = [ + {"@type": "cr:Field", "name": "id", "dataType": "sc:Text", + "description": "Stable instance identifier (dataset/category/name)"}, + {"@type": "cr:Field", "name": "name", "dataType": "sc:Text", + "description": "Instance name"}, + {"@type": "cr:Field", "name": "path", "dataType": "sc:Text", + "description": "File path"}, + ] + + features = self.features + if features: + cr_fields.extend(features.to_croissant_fields()) + + # Standard CP model feature fields + for fname, fdesc in [(k, d) for k, _, d in _MODEL_FEATURE_DOCS]: + cr_fields.append({ + "@type": "cr:Field", + "name": fname, + "dataType": _MODEL_FEATURE_SCHEMA_ORG.get(fname, "sc:Text"), + "description": fdesc, + }) + + doc["cr:recordSet"] = [{ + "@type": "cr:RecordSet", + "name": "instances", + "cr:field": cr_fields, + }] + + return doc + + +# --------------------------------------------------------------------------- +# Standalone adapter functions (for use as target_transform) +# --------------------------------------------------------------------------- + +def to_croissant_example(metadata: dict) -> dict: + """ + Convert instance metadata to a Croissant example record. 
+ + Usable as a ``target_transform``:: + + from cpmpy.tools.datasets.metadata import to_croissant_example + dataset = JSPLibDataset(root="data", target_transform=to_croissant_example) + for instance, record in dataset: + print(record["id"], record["jobs"]) + """ + return InstanceInfo(metadata).to_croissant_example() + + +def to_gbd_features(metadata: dict) -> dict: + """ + Convert instance metadata to a GBD-style feature record. + + Usable as a ``target_transform``:: + + from cpmpy.tools.datasets.metadata import to_gbd_features + dataset = JSPLibDataset(root="data", target_transform=to_gbd_features) + for instance, record in dataset: + print(record["id"], record["num_constraints"]) + """ + return InstanceInfo(metadata).to_gbd_features() diff --git a/cpmpy/tools/datasets/miplib.py b/cpmpy/tools/datasets/miplib.py index 8848522fa..6dab416a9 100644 --- a/cpmpy/tools/datasets/miplib.py +++ b/cpmpy/tools/datasets/miplib.py @@ -21,13 +21,19 @@ class MIPLibDataset(FileDataset): # torch.utils.data.Dataset compatible More information on MIPLib can be found here: https://miplib.zib.de/ """ - + name = "miplib" description = "Mixed Integer Programming Library benchmark instances." - url = "https://miplib.zib.de/" + homepage = "https://miplib.zib.de/" citation = [ "Gleixner, A., Hendel, G., Gamrath, G., Achterberg, T., Bastubbe, M., Berthold, T., Christophel, P. M., Jarck, K., Koch, T., Linderoth, J., Lubbecke, M., Mittelmann, H. D., Ozyurt, D., Ralphs, T. K., Salvagnin, D., and Shinano, Y. MIPLIB 2017: Data-Driven Compilation of the 6th Mixed-Integer Programming Library. Mathematical Programming Computation, 2021. 
https://doi.org/10.1007/s12532-020-00194-3.", ] + + version = "2017" + license = "CC BY 4.0" + domain = "mip" + tags = ["optimization", "mixed-integer-programming", "mip", "combinatorial"] + language = "MPS" def __init__( diff --git a/cpmpy/tools/datasets/mse.py b/cpmpy/tools/datasets/mse.py index 65fb346a7..de1373fec 100644 --- a/cpmpy/tools/datasets/mse.py +++ b/cpmpy/tools/datasets/mse.py @@ -7,14 +7,15 @@ import os import lzma -from typing import List, Optional +from typing import Optional import zipfile import pathlib import io import cpmpy as cp from cpmpy.tools.io.wcnf import load_wcnf -from cpmpy.tools.datasets._base import FileDataset, classproperty +from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.metadata import FeaturesInfo class MaxSATEvalDataset(FileDataset): # torch.utils.data.Dataset compatible @@ -22,34 +23,35 @@ class MaxSATEvalDataset(FileDataset): # torch.utils.data.Dataset compatible """ MaxSAT Evaluation benchmark dataset. - Provides access to benchmark instances from the MaxSAT Evaluation - competitions. Instances are grouped by `year` and `track` (e.g., - `"exact-unweighted"`, `"exact-weighted"`) and stored as `.wcnf.xz` files. - If the dataset is not available locally, it can be automatically + Provides access to benchmark instances from the MaxSAT Evaluation + competitions. Instances are grouped by `year` and `track` (e.g., + `"exact-unweighted"`, `"exact-weighted"`) and stored as `.wcnf.xz` files. + If the dataset is not available locally, it can be automatically downloaded and extracted. 
More information on the competition can be found here: https://maxsat-evaluations.github.io/ """ - - # -------------------------- Dataset-level metadata -------------------------- # - - _metadata_init_kwargs = {"year": 2024, "track": "exact-unweighted"} - - @classproperty - def name(self) -> str: - return "maxsateval" - @classproperty - def description(self) -> str: - return "MaxSAT Evaluation competition benchmark instances." - - @classproperty - def url(self) -> str: - return "https://maxsat-evaluations.github.io/" + # -------------------------- Dataset-level metadata -------------------------- # - @classproperty - def citation(self) -> List[str]: - return [] + name = "maxsateval" + description = "MaxSAT Evaluation competition benchmark instances." + homepage = "https://maxsat-evaluations.github.io/" + citation = [] + + version = "2024" + license = "competition-specific" + domain = "max_sat" + tags = ["optimization", "max-sat", "weighted-max-sat", "wcnf"] + language = "WCNF" + features = FeaturesInfo({ + "wcnf_num_variables": ("int", "Number of propositional variables"), + "wcnf_num_clauses": ("int", "Total number of clauses (hard + soft)"), + "wcnf_num_hard_clauses": ("int", "Number of hard clauses"), + "wcnf_num_soft_clauses": ("int", "Number of soft clauses"), + "wcnf_total_literals": ("int", "Total number of literals across all clauses"), + "wcnf_num_distinct_weights": ("int", "Number of distinct soft clause weights"), + }) # ---------------------------------------------------------------------------- # diff --git a/cpmpy/tools/datasets/nurserostering.py b/cpmpy/tools/datasets/nurserostering.py index 9f900ed12..918b79c35 100644 --- a/cpmpy/tools/datasets/nurserostering.py +++ b/cpmpy/tools/datasets/nurserostering.py @@ -15,6 +15,7 @@ import cpmpy as cp from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.metadata import FeaturesInfo # Optional dependencies try: @@ -40,7 +41,7 @@ class NurseRosteringDataset(FileDataset): # 
torch.utils.data.Dataset compatible name = "nurserostering" description = "Nurse rostering benchmark instances from schedulingbenchmarks.org." - url = "https://schedulingbenchmarks.org/nrp/" + homepage = "https://schedulingbenchmarks.org/nrp/" citation = [ "Strandmark, P., Qu, Y. and Curtois, T. First-order linear programming in a column generation-based heuristic approach to the nurse rostering problem. Computers & Operations Research, 2020. 120, p. 104945.", "Demirovic, E., Musliu, N., and Winter, F. Modeling and solving staff scheduling with partial weighted maxSAT. Annals of Operations Research, 2019. 275(1): p. 79-99.", @@ -48,6 +49,17 @@ class NurseRosteringDataset(FileDataset): # torch.utils.data.Dataset compatible "Rahimian, E., Akartunali, K., and Levine, J. A hybrid integer programming and variable neighbourhood search algorithm to solve nurse rostering problems. European Journal of Operational Research, 2017. 258(2): p. 411-423.", ] + version = "1.0.0" + license = "academic-use" + domain = "scheduling" + tags = ["satisfaction", "nurse-rostering", "scheduling", "timetabling"] + language = "NRP-XML" + features = FeaturesInfo({ + "horizon": ("int", "Planning horizon in days"), + "num_staff": ("int", "Number of nurses / staff members"), + "num_shifts": ("int", "Number of distinct shift types"), + }) + def __init__(self, root: str = ".", transform=None, target_transform=None, download:bool=False, sort_key=None, metadata_workers: int = 1): """ Initialize the Nurserostering Dataset. diff --git a/cpmpy/tools/datasets/opb.py b/cpmpy/tools/datasets/opb.py index 4df23d6cc..1414af6e1 100644 --- a/cpmpy/tools/datasets/opb.py +++ b/cpmpy/tools/datasets/opb.py @@ -12,16 +12,17 @@ import io from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo -class OPBDataset(FileDataset): +class OPBDataset(FileDataset): """ Pseudo Boolean Competition (PB) benchmark dataset. 
- Provides access to benchmark instances from the Pseudo Boolean - competitions. Instances are grouped by `year` and `track` (e.g., - `"OPT-LIN"`, `"DEC-LIN"`) and stored as `.opb.xz` files. - If the dataset is not available locally, it can be automatically + Provides access to benchmark instances from the Pseudo Boolean + competitions. Instances are grouped by `year` and `track` (e.g., + `"OPT-LIN"`, `"DEC-LIN"`) and stored as `.opb.xz` files. + If the dataset is not available locally, it can be automatically downloaded and extracted. More information on the competition can be found here: https://www.cril.univ-artois.fr/PB25/ @@ -29,7 +30,22 @@ class OPBDataset(FileDataset): name = "opb" description = "Pseudo-Boolean Competition benchmark instances." - url = "https://www.cril.univ-artois.fr/PB25/" + homepage = "https://www.cril.univ-artois.fr/PB25/" + citation = [ + "Berre, D. L., Parrain, A. The Pseudo-Boolean Evaluation 2011. JSAT, 7(1), 2012.", + ] + + version = "2024" + license = "competition-specific" + domain = "pseudo_boolean" + tags = ["optimization", "pseudo-boolean", "opb", "combinatorial"] + language = "OPB" + features = FeaturesInfo({ + "author": ("str", "Author extracted from filename convention"), + "opb_num_variables": ("int", "Number of Boolean variables (from OPB header)"), + "opb_num_constraints": ("int", "Number of constraints (from OPB header)"), + "opb_num_products": FieldInfo("int", "Number of non-linear product terms (from OPB header)", nullable=True), + }) def __init__( self, diff --git a/cpmpy/tools/datasets/psplib.py b/cpmpy/tools/datasets/psplib.py index e2fec4496..74a7604e0 100644 --- a/cpmpy/tools/datasets/psplib.py +++ b/cpmpy/tools/datasets/psplib.py @@ -10,17 +10,39 @@ import zipfile from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo + class PSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible """ PSPlib Dataset in a PyTorch compatible format. 
- + More information on PSPlib can be found here: https://www.om-db.wi.tum.de/psplib/main.html """ - + name = "psplib" description = "Project Scheduling Problem Library (RCPSP) benchmark instances." - url = "https://www.om-db.wi.tum.de/psplib/main.html" + homepage = "https://www.om-db.wi.tum.de/psplib/main.html" + citation = [ + "Kolisch, R., Sprecher, A. PSPLIB - A project scheduling problem library. European Journal of Operational Research, 96(1), 205-216, 1997.", + ] + + version = "1.0.0" + license = "academic-use" + domain = "scheduling" + tags = ["optimization", "project-scheduling", "rcpsp", "scheduling", "combinatorial"] + language = "PSPLIB" + features = FeaturesInfo({ + "num_jobs": ("int", "Number of jobs (activities) in the project"), + "horizon": ("int", "Planning horizon (maximum makespan upper bound)"), + "num_renewable_resources": ("int", "Number of renewable resource types"), + "num_nonrenewable_resources": FieldInfo("int", "Number of non-renewable resource types", nullable=True), + "num_doubly_constrained_resources":FieldInfo("int", "Number of doubly-constrained resource types", nullable=True), + "duedate": FieldInfo("int", "Project due date", nullable=True), + "tardcost": FieldInfo("int", "Tardiness cost per unit time", nullable=True), + "mpm_time": FieldInfo("int", "Minimum project makespan (MPM)", nullable=True), + "resource_availabilities": FieldInfo("list", "Available units per resource type", nullable=True), + }) def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): diff --git a/cpmpy/tools/datasets/transforms.py b/cpmpy/tools/datasets/transforms.py index 8421d022f..2fab520be 100644 --- a/cpmpy/tools/datasets/transforms.py +++ b/cpmpy/tools/datasets/transforms.py @@ -127,6 +127,15 @@ def extract_format_metadata(content, format_name): return result +def _enrich_from_model(model, metadata): + """Merge model features into an 
InstanceInfo, returning the updated copy.""" + from cpmpy.tools.datasets.metadata import InstanceInfo + new_fields = metadata_from_model(model) + if isinstance(metadata, InstanceInfo): + return metadata | new_fields + return InstanceInfo({**metadata, **new_fields}) + + def metadata_from_model(model): """Add decision variable and objective info from a CPMpy Model to metadata. diff --git a/cpmpy/tools/datasets/xcsp3.py b/cpmpy/tools/datasets/xcsp3.py index a947c9b3a..d7aaef91c 100644 --- a/cpmpy/tools/datasets/xcsp3.py +++ b/cpmpy/tools/datasets/xcsp3.py @@ -12,13 +12,14 @@ import cpmpy as cp from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.metadata import FeaturesInfo class XCSP3Dataset(FileDataset): # torch.utils.data.Dataset compatible """ XCSP3 Dataset in a PyTorch compatible format. - + Arguments: root (str): Root directory containing the XCSP3 instances (if 'download', instances will be downloaded to this location) year (int): Competition year (2022, 2023 or 2024) @@ -30,7 +31,20 @@ class XCSP3Dataset(FileDataset): # torch.utils.data.Dataset compatible name = "xcsp3" description = "XCSP3 competition benchmark instances for constraint satisfaction and optimization." - url = "https://xcsp.org/instances/" + homepage = "https://xcsp.org/instances/" + citation = [ + "Audemard, G., Boussemart, F., Lecoutre, C., Piette, C., Tabary, S. XCSP3: An Integrated Format for Benchmarking Combinatorial Constrained Problems. 
arXiv:2009.00514, 2020.", + ] + + version = "2024" + license = "competition-specific" + domain = "constraint_programming" + tags = ["satisfaction", "optimization", "xcsp3", "combinatorial"] + language = "XCSP3" + features = FeaturesInfo({ + "instance_type": ("str", "Problem type declared in the XML root element (CSP, COP, WCSP, …)"), + "xcsp_format": ("str", "XCSP format version string from the XML header"), + }) def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): From a1b421f68bdaacd806ee82b59cb4789f8bf6a78c Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Mon, 2 Mar 2026 13:59:39 +0100 Subject: [PATCH 130/152] Model objects metadata --- cpmpy/tools/datasets/metadata.py | 37 ++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/cpmpy/tools/datasets/metadata.py b/cpmpy/tools/datasets/metadata.py index 134019d25..59d4d6540 100644 --- a/cpmpy/tools/datasets/metadata.py +++ b/cpmpy/tools/datasets/metadata.py @@ -39,7 +39,13 @@ _MODEL_FEATURE_FIELDS: frozenset = frozenset({ "num_variables", "num_bool_variables", "num_int_variables", "num_constraints", "constraint_types", "has_objective", - "objective_type", "domain_size_min", "domain_size_max", "domain_size_mean", + "objective_type", "objective", "objective_is_min", + "domain_size_min", "domain_size_max", "domain_size_mean", +}) + +# Live Python objects added by Load — not JSON-serialisable, excluded from exports +_MODEL_OBJECT_KEYS: frozenset = frozenset({ + "decision_variables", }) # Prefixes for format-specific metadata (not portable across translations) @@ -298,12 +304,14 @@ def domain_metadata(self) -> dict: These are format-independent, problem-level fields such as ``jobs``, ``machines``, ``optimum``, ``horizon``, ``num_staff``, etc. - Excludes system keys, CP model statistics, and format-specific fields. 
+ Excludes system keys, CP model statistics, live model objects, and + format-specific fields. """ return { k: v for k, v in self.items() if k not in _SYSTEM_KEYS and k not in _MODEL_FEATURE_FIELDS + and k not in _MODEL_OBJECT_KEYS and not any(k.startswith(p) for p in _FORMAT_SPECIFIC_PREFIXES) } @@ -319,6 +327,31 @@ def model_features(self) -> dict: """ return {k: v for k, v in self.items() if k in _MODEL_FEATURE_FIELDS} + @property + def model_objects(self) -> dict: + """ + Live Python objects added by the ``Load`` transform. + + Currently contains: + + - ``decision_variables``: ``{name: CPMpy_variable}`` mapping for every + decision variable in the loaded model. + + These objects are **not JSON-serialisable** and are excluded from + ``domain_metadata``, ``to_croissant_example()``, and ``to_gbd_features()``. + They are available only in-memory after a ``Load`` transform has run. + + .. code-block:: python + + dataset.transform = Load(dataset.loader, open=dataset.open) + model, info = dataset[0] + + vars = info.model_objects["decision_variables"] + model.solve() + print({name: v.value() for name, v in vars.items()}) + """ + return {k: v for k, v in self.items() if k in _MODEL_OBJECT_KEYS} + @property def format_metadata(self) -> dict: """ From cad12280f239559b6da7ac15ad1848a5721515a2 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 5 Mar 2026 11:11:19 +0100 Subject: [PATCH 131/152] Refactor and document datasets core --- cpmpy/tools/datasets/{_base.py => core.py} | 945 +++++++++++---------- docs/api/tools.rst | 14 +- 2 files changed, 490 insertions(+), 469 deletions(-) rename cpmpy/tools/datasets/{_base.py => core.py} (70%) diff --git a/cpmpy/tools/datasets/_base.py b/cpmpy/tools/datasets/core.py similarity index 70% rename from cpmpy/tools/datasets/_base.py rename to cpmpy/tools/datasets/core.py index 6320a5e6e..9c5f3ac0b 100644 --- a/cpmpy/tools/datasets/_base.py +++ b/cpmpy/tools/datasets/core.py @@ -1,8 +1,12 @@ """ -Dataset Base Class +Dataset Base Classes 
+
+This module defines multiple abstract datasets, a hierarchy of classes which together
+serve as the foundation for competition and application-oriented benchmarking datasets.
+
+They enable the loading and managing of well-known benchmark instance collections
+from the Constraint Optimisation (CO) community.
+
-This module defines the abstract `_Dataset` class, which serves as the foundation
-for loading and managing benchmark instance collections in CPMpy-based experiments.
 It standardizes how datasets are downloaded, stored, accessed, and optionally transformed.
 
 It provides a Pytorch compatible interface (constructor arguments like "transform" and the
@@ -10,6 +14,84 @@
 
 Additionaly, it provides a collection of methods and helper functions to adapt the dataset
 to the specific usecase requirements of constraint optimisation benchmarks.
+
+To implement a new dataset, one needs to subclass one of the abstract dataset classes,
+and provide implementations for the following methods:
+- _loader: loads a CPMpy model from a string representation of the instance (file)
+- category: return a dictionary of category labels, describing to which subset the dataset has been restricted (year, track, ...)
+- download: download the dataset (helper function :func:`_download_file` is provided)
+
+Some optional methods to override are:
+- collect_instance_metadata: collect metadata about individual instances (e.g. number of variables, constraints, ...), potentially domain specific
+- open: how to open the instance file (e.g. for compressed files, use .xz, .lzma, .gz, ...)
+
+Datasets must also implement the following dataset metadata attributes:
+- name: the name of the dataset
+- description: a short description of the dataset
+- homepage: a URL to the homepage of the dataset
+- citation: a list of citations for the dataset
+
+Optional dataset schema metadata:
+- features: a :class:`FeaturesInfo` schema describing domain-level instance fields
+  (for example ``jobs``, ``machines``, ``optimum``, ``horizon``).
+  This schema is exposed in dataset-level metadata and used by dataset cards and
+  export formats (e.g. Croissant) to document the meaning and types of fields in
+  instance metadata.
+
+``features`` is optional. Default behavior is ``features = None``:
+- dataset cards are still generated, but the "Instance Features (Domain Metadata)"
+  section is omitted.
+- Croissant export is still generated with core fields (``id``, ``name``, ``path``)
+  and standard CP model feature fields; only domain-specific schema fields from
+  ``features`` are omitted.
+- instance metadata collection and loading behavior are unchanged; ``features``
+  only documents schema and export metadata.
+
+Feature inheritance and extension:
+- child dataset classes may declare only new fields in ``features``; these are
+  merged with inherited fields from the nearest ancestor defining ``features``.
+- child fields override inherited fields with the same name.
+- to use a completely custom schema, define the full ``features`` object in the
+  child class.
+
+All parts for which an implementation must be provided are marked with an @abstractmethod decorator,
+raising a NotImplementedError if not overridden.
+
+Dataset files should be downloaded as-is, without any preprocessing or decompression. Upon initial download,
+instance-level metadata gets auto-collected and stored in a JSON sidecar file. All subsequent accesses to the dataset
+will use the sidecar file to avoid re-collecting the metadata.
+
+Iterating over the dataset is done in the same way as a PyTorch dataset. It returns 2-tuples (x,y) of:
+- x: instance reference (a file path is the only supported type at the moment)
+- y: metadata (solution, features, origin, etc.)
+
+Example:
+
+.. code-block:: python
+
+    dataset = MyDataset(download=True)
+    for x, y in dataset:
+        print(x, y)
+
+The dataset also supports PyTorch-style transforms and target transforms.
+
+.. code-block:: python
+    from cpmpy.tools.io import load_wcnf
+    from cpmpy.tools.datasets.metadata import to_croissant_example
+
+    dataset = MyDataset(download=True, transform=load_wcnf, target_transform=to_croissant_example)
+    for model, croissant_record in dataset:
+        ...
+
+For advanced operations on the datasets, like filtering, mapping, splitting, shuffling, sorting, etc.,
+make use of the PyTorch tooling ecosystem (thanks to our compatible interface).
+
+Example:
+.. code-block:: python
+    dataset = MyDataset(download=True, transform=load_wcnf, target_transform=to_croissant_example)
+
+    from torch.utils.data import random_split
+    train_dataset, test_dataset = random_split(dataset, [0.8, 0.2])
 """
 
 from __future__ import annotations
@@ -27,7 +109,9 @@
 from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
 import multiprocessing
 
-from altair.utils.schemapi import _passthrough
+from .metadata import FeaturesInfo
+from .metadata import DatasetInfo, InstanceInfo
+from .utils import extract_model_features, portable_instance_metadata
 
 # tqdm as an optional dependency, provides prettier progress bars
 try:
@@ -37,17 +121,6 @@
 
 import cpmpy as cp
 
-from .metadata import (
-    InstanceInfo, DatasetInfo, FeaturesInfo, FieldInfo,
-    _MODEL_FEATURE_FIELDS, _FORMAT_SPECIFIC_PREFIXES,
-)
-
-# Re-export constants for backward compatibility with code that imports from _base
-__all__ = [
-    "_MODEL_FEATURE_FIELDS", "_FORMAT_SPECIFIC_PREFIXES",
-    
"InstanceInfo", "DatasetInfo", "FeaturesInfo", "FieldInfo", -] - def _format_bytes(bytes_num): """ @@ -74,210 +147,163 @@ def __init__(self, func): def __get__(self, instance, owner): return self.func(owner) +class Dataset(ABC): + """ + Abstract base class for CO datasets. -def portable_instance_metadata(metadata: dict) -> dict: + Each instance in a dataset is characterised by a (x, y) pair of: + x: instance reference (e.g., file path, database key, generated seed, ...) + y: metadata (solution, features, origin, etc.) """ - Filter sidecar metadata to only portable, domain-specific fields. + - Strips model features (num_variables, constraint_types, ...), - format-specific fields (opb_*, wcnf_*, mps_*, ...), and internal - error fields (starting with ``_``). + # -------------- Dataset-level metadata (override in subclasses) ------------- # - Keeps domain-specific metadata that is independent of the file format, - such as ``jobs``, ``machines``, ``optimum``, ``horizon``, ``bounds``, etc. + @classproperty + @abstractmethod + def name(self) -> str: pass - Arguments: - metadata (dict): Full sidecar metadata dictionary. + @classproperty + @abstractmethod + def description(self) -> str: pass - Returns: - dict with only portable fields. - """ - return { - k: v for k, v in metadata.items() - if not k.startswith("_") - and k not in _MODEL_FEATURE_FIELDS - and not any(k.startswith(p) for p in _FORMAT_SPECIFIC_PREFIXES) - } + @classproperty + @abstractmethod + def homepage(self) -> str: pass + @classproperty + def citation(self) -> List[str]: + return [] -def _extract_model_features(model) -> dict: - """ - Extract generic CP features from a CPMpy Model. 
+ # OPTIONAL + features: Optional[FeaturesInfo] = None # domain_metadata field schema + + # ---------------------------------------------------------------------------- # - Arguments: - model: a cpmpy.Model instance + def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None): + """ + Arguments: + transform (callable, optional): Optional transform applied to the instance reference. + target_transform (callable, optional): Optional transform applied to the metadata. + """ + self.transform = transform + self.target_transform = target_transform - Returns: - dict with keys: num_variables, num_bool_variables, num_int_variables, - num_constraints, constraint_types, has_objective, objective_type, - domain_size_min, domain_size_max, domain_size_mean - """ - from cpmpy.transformations.get_variables import get_variables_model - from cpmpy.expressions.variables import _BoolVarImpl - from cpmpy.expressions.core import Expression - from cpmpy.expressions.utils import is_any_list - - variables = get_variables_model(model) - - num_bool = sum(1 for v in variables if isinstance(v, _BoolVarImpl)) - num_int = len(variables) - num_bool - - # Domain sizes (lb/ub available on all variable types) - domain_sizes = [int(v.ub) - int(v.lb) + 1 for v in variables] if variables else [] - - # Constraint types: collect .name from top-level constraints - constraint_type_counts = {} - - def _count_constraints(c): - if is_any_list(c): - for sub in c: - _count_constraints(sub) - elif isinstance(c, Expression): - name = c.name - constraint_type_counts[name] = constraint_type_counts.get(name, 0) + 1 - - for c in model.constraints: - _count_constraints(c) - - num_constraints = sum(constraint_type_counts.values()) - - # Objective - has_obj = model.objective_ is not None - obj_type = "none" - if has_obj: - obj_type = "min" if model.objective_is_min else "max" - - return { - "num_variables": len(variables), - "num_bool_variables": num_bool, - "num_int_variables": 
num_int, - "num_constraints": num_constraints, - "constraint_types": constraint_type_counts, - "has_objective": has_obj, - "objective_type": obj_type, - "domain_size_min": min(domain_sizes) if domain_sizes else None, - "domain_size_max": max(domain_sizes) if domain_sizes else None, - "domain_size_mean": round(sum(domain_sizes) / len(domain_sizes), 2) if domain_sizes else None, - } - - -def extract_model_features(model) -> dict: - """Public wrapper for extracting generic CPMpy model features.""" - return _extract_model_features(model) - - -# Global context for process-based metadata collection workers -_metadata_worker_context = {} - - -def _init_metadata_worker(context_dict, collect_metadata_func, reader_func, open_func): - """Initialize worker process with dataset context.""" - global _metadata_worker_context - _metadata_worker_context = context_dict.copy() - _metadata_worker_context['collect_instance_metadata'] = collect_metadata_func - _metadata_worker_context['reader'] = reader_func - _metadata_worker_context['open_func'] = open_func - - -def _collect_one_metadata_worker(file_path_str): - """Worker function for process-based metadata collection.""" - global _metadata_worker_context - file_path = pathlib.Path(file_path_str) - dataset_dir = pathlib.Path(_metadata_worker_context['dataset_dir']) - meta_path = dataset_dir / (file_path.name + _metadata_worker_context['metadata_extension']) - - # Collect instance metadata using the provided function - collect_metadata = _metadata_worker_context['collect_instance_metadata'] - try: - instance_meta = collect_metadata(str(file_path)) - except Exception as e: - instance_meta = {"_metadata_error": str(e)} - - # Separate portable from format-specific fields - portable = portable_instance_metadata(instance_meta) - format_specific = { - k: v for k, v in instance_meta.items() - if k not in portable and not k.startswith("_") - } - - # Derive instance name - stem = file_path.stem - for ext in (".xml", ".wcnf", ".opb"): - if 
stem.endswith(ext): - stem = stem[:len(stem) - len(ext)] - break - - # Build structured sidecar - sidecar = { - "dataset": _metadata_worker_context['dataset_metadata'], - "instance_name": stem, - "source_file": str(file_path.relative_to(dataset_dir)), - "category": _metadata_worker_context['category'], - "instance_metadata": portable, - "format_metadata": format_specific, - } - - if "_metadata_error" in instance_meta: - sidecar["_metadata_error"] = instance_meta["_metadata_error"] - - # Preserve or compute model features - model_features = None - if meta_path.exists(): - try: - with open(meta_path, "r") as f: - existing = json.load(f) - if "model_features" in existing: - model_features = existing["model_features"] - except (json.JSONDecodeError, IOError): - pass - - if model_features is None: - reader = _metadata_worker_context['reader'] - open_func = _metadata_worker_context['open_func'] - if not callable(reader): - raise TypeError( - f"Cannot extract model features for {file_path}: " - "no dataset reader configured." - ) - model = reader(str(file_path), open=open_func) - model_features = extract_model_features(model) - - sidecar["model_features"] = model_features + def __init_subclass__(cls, **kwargs): + """ + Auto-merge ``features`` when a subclass declares only its *new* fields. - with open(meta_path, "w") as f: - json.dump(sidecar, f, indent=2) - - return str(file_path) + If a subclass explicitly defines ``features``, it is merged with the + nearest ancestor's ``features`` so the subclass only needs to list + what is new. The subclass fields take precedence over inherited ones. + .. code-block:: python -""" -dataset.map(transform) -dataset.filter(predicate) -dataset.shuffle(seed) -dataset.split(ratio) -""" + class MyJSPDataset(JSPLibDataset): + # No need to repeat {jobs, machines, optimum, ...} — they are + # inherited and merged in automatically. 
+ features = FeaturesInfo({"difficulty": ("float", "Computed difficulty score")}) -class Dataset(ABC): - """ - Abstract base class for datasets. + def collect_instance_metadata(self, file): + meta = super().collect_instance_metadata(file) + meta["difficulty"] = ... + return meta - Each instance in a dataset is characterised by a (x, y) pair of: - x: instance reference (e.g., file path, database key, generated seed, ...) - y: metadata (solution, features, origin, etc.) - """ + To *replace* rather than extend the parent schema, explicitly set + ``features`` to the complete schema you want (the auto-merge still + runs, but if you start from scratch the parent's fields will be + absent from the parent's FeaturesInfo and won't be merged). + Alternatively, set ``features = None`` to clear the schema entirely. + """ + super().__init_subclass__(**kwargs) + subclass_features = cls.__dict__.get("features") + if subclass_features is None: + return + # Walk the MRO to find the nearest ancestor that has features defined + for base in cls.__mro__[1:]: + parent_features = base.__dict__.get("features") + if parent_features is not None: + cls.features = parent_features | subclass_features + return - def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None): + + # ---------------------------------------------------------------------------- # + # Methods to implement in subclasses: # + # ---------------------------------------------------------------------------- # + + @abstractmethod + def instance_metadata(self, instance) -> InstanceInfo: + """ + Return the metadata for a given instance file. + + Returns an :class:`~metadata.InstanceInfo`, which is a ``dict`` subclass + so all existing ``meta['year']``, ``meta.get('jobs')`` access is unchanged. + Structured access via ``info.domain_metadata``, ``info.model_features``, + ``info.id``, etc. is additive. 
+ """ + pass + + + # ---------------------------------------------------------------------------- # + # Public interface # + # ---------------------------------------------------------------------------- # + + @classmethod + def dataset_metadata(cls) -> DatasetInfo: + """ + Return dataset-level metadata as a :class:`~metadata.DatasetInfo`. + + :class:`~metadata.DatasetInfo` is the dataset metadata object. + It offers dict-compatible access for straightforward key-based usage + (for example ``dataset_metadata()['name']``), and also provides richer + helper methods such as ``dataset_metadata().card()`` and + ``dataset_metadata().to_croissant()``. + + Returns: + DatasetInfo: The dataset-level metadata. + """ + if isinstance(cls.citation, str): + citations = [cls.citation] if cls.citation else [] + else: + citations = list(cls.citation) + + return DatasetInfo({ + "name": cls.name, + "description": cls.description, + "homepage": cls.homepage, + "citation": citations, + "features": cls.features, + }) + + @classmethod + def card(cls, format: str = "markdown") -> str: """ + Generate a dataset card for this dataset. + + Shorthand for ``cls.dataset_metadata().card(format=format)``. + + Follows HuggingFace Hub convention: YAML frontmatter (machine-readable) + followed by a markdown body (human-readable). + Arguments: - transform (callable, optional): Optional transform applied to the instance reference. - target_transform (callable, optional): Optional transform applied to the metadata. + format (str): Only ``"markdown"`` is currently supported. + + Returns: + str: The dataset card as a string. """ - self.transform = transform - self.target_transform = target_transform + return cls.dataset_metadata().card(format=format) + class IndexedDataset(Dataset): """ Abstract base class for indexed datasets. + + Indexed datasets are datasets where the instances are indexed by a unique identifier and + can be accessed by that identifier. 
For example its positional index within the dataset. + + Implementing this class requires implementing the following methods: + - __len__: return the total number of instances + - __getitem__: return the instance and metadata at the given index / identifier """ @abstractmethod @@ -290,7 +316,7 @@ def __len__(self) -> int: @abstractmethod def __getitem__(self, index: int) -> Tuple[Any, Any]: """ - Return the instance and metadata at the given index. + Return the instance and metadata at the given index / identifier. Returns: x: instance reference (e.g., file path, database key, generated seed, ...) @@ -299,6 +325,9 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: pass def __iter__(self): + """ + Iterate over the dataset. + """ for i in range(len(self)): yield self[i] @@ -344,6 +373,13 @@ def expand_varying_kwargs( class IterableDataset(Dataset): """ Abstract base class for iterable datasets. + + Iterable datasets are datasets where the instances are iterable and can be accessed by an iterator. + The dataset does not provide random access to the instances through an index or identifier. + An example is a generator function that yields the instances based on a random seed. + + Implementing this class requires implementing the following method: + - __iter__: return an iterator over the dataset """ @abstractmethod @@ -368,6 +404,32 @@ def from_generator( """ Create an IterableDataset from a generator. + Wraps a Python generator function into an ``IterableDataset``. + The method determines the number of ``generator(...)`` calls and their + keyword arguments from ``gen_kwargs`` and ``vary``. + + ``gen_kwargs`` is the source of truth: + keys are parameter names of ``generator``, values are argument values. + ``vary`` selects which of these keys should be expanded. + + Behavior summary: + - ``vary is None``: + one call -> ``generator(**gen_kwargs)``. + - ``vary`` is one key (e.g. 
``"n"``): + one call per value in ``gen_kwargs["n"]``, while all other + keyword arguments from ``gen_kwargs`` are passed unchanged. + - ``vary`` is multiple keys (e.g. ``["n", "seed"]``): + one call per tuple of values for those keys, while all non-varied + keyword arguments from ``gen_kwargs`` are passed unchanged. + Two options for the varying: + - ``vary_mode="zip"``: parallel iteration + - ``vary_mode="product"``: Cartesian product + + Important: + - Every key mentioned in ``vary`` must already exist in ``gen_kwargs``. + - If a key is varied, its value in ``gen_kwargs`` must be iterable. + - Non-varied keys are reused unchanged for every generator call. + Arguments: generator: Callable that returns an iterator yielding (x, y) pairs. When ``vary`` is None, called as ``generator()`` or @@ -380,6 +442,111 @@ def from_generator( from zip (default) or product of the iterables. vary_mode: When ``vary`` is a list, ``'zip'`` (parallel iteration, same-length iterables) or ``'product'`` (Cartesian product). + + Examples: + + .. code-block:: python + + def gen_graph_coloring(num_instances, n_vertices, edge_prob, seed): + import random + rng = random.Random(seed) + for i in range(num_instances): + x = { + "problem": "graph_coloring", + "n_vertices": n_vertices, + "edge_prob": edge_prob, + "instance_seed": rng.randint(0, 10**9), + } + y = {"family": "gc", "name": f"gc_{n_vertices}_{i}"} + yield x, y + + Fixed kwargs (single call): + + .. code-block:: python + + ds = IterableDataset.from_generator( + gen_graph_coloring, + gen_kwargs={ + "num_instances": 3, + "n_vertices": 40, + "edge_prob": 0.2, + "seed": 7, + }, + ) + # Calls gen_graph_coloring(...) once with fixed kwargs + + Vary one kwarg: + + .. code-block:: python + + ds = IterableDataset.from_generator( + gen_graph_coloring, + gen_kwargs={ + "num_instances": 3, + "n_vertices": 40, + "edge_prob": [0.1, 0.2, 0.3], + "seed": 7, + }, + vary="edge_prob", + ) + # Calls: + # gen_graph_coloring(..., edge_prob=0.1, ...) 
+ # gen_graph_coloring(..., edge_prob=0.2, ...) + # gen_graph_coloring(..., edge_prob=0.3, ...) + # Other kwargs (num_instances, n_vertices, seed) stay fixed. + + Vary multiple kwargs with zip (default): + + .. code-block:: python + + def gen_rcpsp_like(num_instances, n_jobs, n_resources, tightness, seed): + import random + rng = random.Random(seed) + for i in range(num_instances): + x = { + "problem": "rcpsp", + "n_jobs": n_jobs, + "n_resources": n_resources, + "tightness": tightness, + "instance_seed": rng.randint(0, 10**9), + } + y = {"family": "rcpsp", "name": f"j{n_jobs}_r{n_resources}_{i}"} + yield x, y + + ds = IterableDataset.from_generator( + gen_rcpsp_like, + gen_kwargs={ + "num_instances": 2, + "n_jobs": [30, 60], + "n_resources": [4, 8], + "tightness": [0.6, 0.8], + "seed": 11, + }, + vary=["n_jobs", "n_resources", "tightness"], + vary_mode="zip", + ) + # Calls: + # gen_rcpsp_like(..., n_jobs=30, n_resources=4, tightness=0.6, ...) + # gen_rcpsp_like(..., n_jobs=60, n_resources=8, tightness=0.8, ...) + # Non-varied kwargs (num_instances, seed) are reused in both calls. + + Vary multiple kwargs with Cartesian product:: + + .. code-block:: python + + ds = IterableDataset.from_generator( + gen_rcpsp_like, + gen_kwargs={ + "num_instances": 1, + "n_jobs": [30, 60], + "n_resources": [4, 8], + "tightness": [0.6, 0.8], + "seed": 11, + }, + vary=["n_jobs", "n_resources", "tightness"], + vary_mode="product", + ) + # Calls all 2 x 2 x 2 = 8 combinations """ gen_kwargs = gen_kwargs or {} @@ -422,99 +589,30 @@ def __iter__(self): class FileDataset(IndexedDataset): """ - Abstract base class for PyTorch-style datasets of CO benchmarking instances. + Abstract base class for PyTorch-style datasets of file-based CO benchmarking sets. The `FileDataset` class provides a standardized interface for downloading and accessing file-backed benchmark instances. This class should not be used on its own. 
- Instead have a look at one of the concrete subclasses, providing access to - well-known datasets from the community. + Either have a look at one of the concrete subclasses, providing access to + well-known datasets from the community, or use this class as the base for your own dataset. + + For a more detailed authoring guide (design patterns, metadata conventions, + and implementation checklist), see :ref:`datasets_advanced_authoring`. """ # Extension for metadata sidecar files METADATA_EXTENSION = ".meta.json" - # -------------- Dataset-level metadata (override in subclasses) ------------- # - - @classproperty - @abstractmethod - def name(self) -> str: pass - - @classproperty - @abstractmethod - def description(self) -> str: pass - - @classproperty - @abstractmethod - def homepage(self) -> str: pass - - @classproperty - def citation(self) -> List[str]: - return [] - - # Optional enrichment — all have sensible defaults, zero lines required - version: Optional[str] = None # e.g. "2024", "1.0.0" - license: Optional[Union[str, List[str]]] = None # e.g. "MIT", ["CC BY 4.0"] - domain: str = "constraint_programming" # e.g. "scheduling", "sat" - tags: List[str] = [] # e.g. ["optimization", "scheduling"] - language: Optional[str] = None # e.g. "XCSP3", "OPB", "JSPLib" - features: Optional[FeaturesInfo] = None # domain_metadata field schema - release_notes: Optional[dict] = None # {version: changelog} - - # Multiple download origins (override in subclasses or via config) - # Origins are tried in order, falling back to homepage if all fail - origins: List[str] = [] - - # ---------------------------------------------------------------------------- # - - def __init_subclass__(cls, **kwargs): - """ - Auto-merge ``features`` when a subclass declares only its *new* fields. - - If a subclass explicitly defines ``features``, it is merged with the - nearest ancestor's ``features`` so the subclass only needs to list - what is new. 
The subclass fields take precedence over inherited ones. - - .. code-block:: python - - class MyJSPDataset(JSPLibDataset): - # No need to repeat {jobs, machines, optimum, ...} — they are - # inherited and merged in automatically. - features = FeaturesInfo({"difficulty": ("float", "Computed difficulty score")}) - - def collect_instance_metadata(self, file): - meta = super().collect_instance_metadata(file) - meta["difficulty"] = ... - return meta - - To *replace* rather than extend the parent schema, explicitly set - ``features`` to the complete schema you want (the auto-merge still - runs, but if you start from scratch the parent's fields will be - absent from the parent's FeaturesInfo and won't be merged). - Alternatively, set ``features = None`` to clear the schema entirely. - """ - super().__init_subclass__(**kwargs) - subclass_features = cls.__dict__.get("features") - if subclass_features is None: - return - # Walk the MRO to find the nearest ancestor that has features defined - for base in cls.__mro__[1:]: - parent_features = base.__dict__.get("features") - if parent_features is not None: - cls.features = parent_features | subclass_features - return - - def __init__( self, dataset_dir: str = ".", transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, extension: str = ".txt", - metadata_workers: int = 1, **kwargs ): """ - Constructor for the _Dataset base class. + Constructor for the FileDataset base class. Arguments: dataset_dir (str): Path to the dataset directory. @@ -522,7 +620,12 @@ def __init__( target_transform (callable, optional): Optional transform applied to the metadata dictionary. download (bool): If True, downloads the dataset if it does not exist locally (default=False). extension (str): Extension of the instance files. Used to filter instance files from the dataset directory. - metadata_workers (int): Number of parallel workers for metadata collection during download (default: 1). 
+ **kwargs: Advanced options. Currently supports: + - metadata_workers (int): Number of parallel workers for + metadata collection during initial download (default: 1). + - ignore_sidecar (bool): If True, do not read/write metadata + sidecars and collect metadata on demand at iteration time + using ``collect_instance_metadata()`` (default: False). Raises: ValueError: If the dataset directory does not exist and `download=False`, @@ -533,17 +636,17 @@ def __init__( self.dataset_dir = pathlib.Path(dataset_dir) self.extension = extension - # TODO: remove for later? - # if not self.origins: - # from cpmpy.tools.datasets.config import get_origins - # self.origins = get_origins(self.name) + # Advanced options + metadata_workers = kwargs.pop("metadata_workers", 1) + self._ignore_sidecar = kwargs.pop("ignore_sidecar", False) if not self._check_exists(): if not download: raise ValueError("Dataset not found. Please set download=True to download the dataset.") else: self.download() - self._collect_all_metadata(workers=metadata_workers) + if not self._ignore_sidecar: + self._collect_all_metadata(workers=metadata_workers) files = self._list_instances() print(f"Finished downloading {len(files)} instances") @@ -579,7 +682,7 @@ def _loader(content: str) -> cp.Model: pass @abstractmethod - def category(self) -> dict: + def categories(self) -> dict: """ Labels to distinguish instances into categories matching to those of the dataset. E.g. @@ -641,6 +744,11 @@ def read(self, instance: os.PathLike) -> str: with self.open(instance) as f: return f.read() + + # ---------------------------------------------------------------------------- # + # Public interface # + # ---------------------------------------------------------------------------- # + def load(self, instance: Union[str, os.PathLike]) -> cp.Model: """ Load a CPMpy model from an instance file. 
@@ -668,12 +776,7 @@ def load(self, instance: Union[str, os.PathLike]) -> cp.Model: # Loading - turn raw contents into CPMpy model return self._loader(content) - - # ---------------------------------------------------------------------------- # - # Public interface # - # ---------------------------------------------------------------------------- # - - def instance_metadata(self, file: os.PathLike) -> InstanceInfo: + def instance_metadata(self, instance: os.PathLike) -> InstanceInfo: """ Return the metadata for a given instance file. @@ -691,80 +794,25 @@ def instance_metadata(self, file: os.PathLike) -> InstanceInfo: metadata = { 'dataset': self.name, 'category': self.category(), - 'name': pathlib.Path(file).name.replace(self.extension, ''), - 'path': file, + 'name': pathlib.Path(instance).name.replace(self.extension, ''), + 'path': instance, } - # Load sidecar metadata if it exists - meta_path = self._metadata_path(file) - if meta_path.exists(): - with open(meta_path, "r") as f: - sidecar = json.load(f) - # Structured: flatten instance_metadata, format_metadata, and model_features - metadata.update(sidecar.get("instance_metadata", {})) - metadata.update(sidecar.get("format_metadata", {})) - metadata.update(sidecar.get("model_features", {})) - return InstanceInfo(metadata) - - @classmethod - def dataset_metadata(cls) -> DatasetInfo: - """ - Return dataset-level metadata as a :class:`~metadata.DatasetInfo`. - - :class:`~metadata.DatasetInfo` is a ``dict`` subclass, so existing - ``dataset_metadata()['name']`` access continues to work unchanged. - New structured access (``dataset_metadata().card()``, - ``dataset_metadata().to_croissant()``, etc.) is additive. - Returns: - DatasetInfo: The dataset-level metadata. - """ - if isinstance(cls.citation, str): - citations = [cls.citation] if cls.citation else [] + # Advanced mode: bypass sidecars and collect metadata on demand. 
+ if self._ignore_sidecar: + metadata.update(self.collect_instance_metadata(file=str(instance))) + return InstanceInfo(metadata) else: - citations = list(cls.citation) - - # Serialise FeaturesInfo to a plain dict so the DatasetInfo is JSON-safe - # (the DatasetInfo.features property reconstructs FeaturesInfo on access) - features_dict = None - if cls.features is not None: - features_dict = cls.features.to_dict() - - return DatasetInfo({ - "name": cls.name, - "description": cls.description, - "url": cls.homepage, # backward-compat key - "homepage": cls.homepage, # HuggingFace / TFDS naming - "citation": citations, - "version": cls.version, - "license": cls.license, - "domain": cls.domain, - "tags": list(cls.tags), - "language": cls.language, - "features": features_dict, - "release_notes": cls.release_notes, - }) - - @classmethod - def card(cls, format: str = "markdown") -> str: - """ - Generate a dataset card for this dataset. - - Shorthand for ``cls.dataset_metadata().card(format=format)``. - - Follows HuggingFace Hub convention: YAML frontmatter (machine-readable) - followed by a markdown body (human-readable). - - Parameters - ---------- - format: - Only ``"markdown"`` is currently supported. - - Returns - ------- - str - The dataset card as a string. 
- """ - return cls.dataset_metadata().card(format=format) + # Load sidecar metadata if it exists + meta_path = self._metadata_path(instance) + if meta_path.exists(): + with open(meta_path, "r") as f: + sidecar = json.load(f) + # Structured: flatten instance_metadata, format_metadata, and model_features + metadata.update(sidecar.get("instance_metadata", {})) + metadata.update(sidecar.get("format_metadata", {})) + metadata.update(sidecar.get("model_features", {})) + return InstanceInfo(metadata) # ---------------------------------------------------------------------------- # @@ -786,10 +834,15 @@ def _list_instances(self) -> list: ]) def __len__(self) -> int: - """Return the total number of instances.""" + """ + Return the total number of instances. + """ return len(self._list_instances()) def __getitem__(self, index: int) -> Tuple[Any, Any]: + """ + Return the instance and metadata at the given index. + """ if index < 0 or index >= len(self): raise IndexError("Index out of range") @@ -912,10 +965,10 @@ def _collect_one_metadata(self, file_path): # Build structured sidecar sidecar = { - "dataset": self.dataset_metadata(), + "dataset": self.dataset_metadata().to_jsonable(), "instance_name": stem, "source_file": str(file_path.relative_to(self.dataset_dir)), - "category": self.category(), + "categories": self.categories(), "instance_metadata": portable, "format_metadata": format_specific, } @@ -952,40 +1005,12 @@ def _collect_one_metadata(self, file_path): # ----------------------------- Download methods ----------------------------- # - @staticmethod - def _try_origin(base_url: str, target: str, destination: str, desc: str, chunk_size: int) -> Optional[pathlib.Path]: - """ - Try to download a file from a specific origin URL. 
- - Arguments: - base_url (str): Base URL to try - target (str): Target filename - destination (str): Destination path - desc (str): Description for progress bar - chunk_size (int): Chunk size for download - - Returns: - pathlib.Path if successful, None if failed - """ - try: - full_url = base_url.rstrip('/') + '/' + target.lstrip('/') - req = Request(full_url) - with urlopen(req) as response: - total_size = int(response.headers.get('Content-Length', 0)) - - FileDataset._download_sequential(full_url, destination, total_size, desc, chunk_size) - return pathlib.Path(destination) - except (HTTPError, URLError): - return None - @staticmethod def _download_file(url: str, target: str, destination: Optional[str] = None, desc: str = None, - chunk_size: int = 1024 * 1024, - origins: Optional[List[str]] = None) -> os.PathLike: + chunk_size: int = 1024 * 1024) -> os.PathLike: """ Download a file from a URL with progress bar and speed information. - Supports multiple origins with fallback. This method provides a reusable download function with progress updates similar to pip and uv, showing download progress, speed, and ETA. @@ -997,7 +1022,6 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, desc (str, optional): Description to show in the progress bar. If None, uses the filename. chunk_size (int): Size of each chunk for download in bytes (default=1MB). - origins (List[str], optional): List of alternative URL bases to try first. Returns: str: The destination path where the downloaded file is saved. 
@@ -1016,14 +1040,6 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, if dest_dir: os.makedirs(dest_dir, exist_ok=True) - # Try custom origins first if provided - if origins: - for origin_url in origins: - result = FileDataset._try_origin(origin_url, target, destination, desc, chunk_size) - if result is not None: - return result - - # Fall back to original URL try: req = Request(url + target) with urlopen(req) as response: @@ -1041,12 +1057,69 @@ def _download_file(url: str, target: str, destination: Optional[str] = None, except (HTTPError, URLError) as e: raise ValueError(f"Failed to download file from {url + target}. Error: {str(e)}") + @staticmethod + def _download_sequential(url: str, filepath: os.PathLike, total_size: int, desc: str, + chunk_size: int = 1024 * 1024): + """Download file sequentially with progress bar.""" + import sys + + # Convert to Path if it's a string + if isinstance(filepath, str): + filepath = pathlib.Path(filepath) + + # Ensure parent directory exists + filepath.parent.mkdir(parents=True, exist_ok=True) + + req = Request(url) + with urlopen(req) as response: + if tqdm is not None: + if total_size > 0: + with tqdm(total=total_size, unit='B', unit_scale=True, + unit_divisor=1024, desc=f"Downloading {desc}", file=sys.stdout, + miniters=1, dynamic_ncols=True, ascii=False) as pbar: + with open(filepath, 'wb') as f: + while True: + chunk = response.read(chunk_size) + if not chunk: + break + f.write(chunk) + pbar.update(len(chunk)) + else: + # Unknown size + with tqdm(unit='B', unit_scale=True, unit_divisor=1024, + desc=f"Downloading {desc}", file=sys.stdout, miniters=1, + dynamic_ncols=True, ascii=False) as pbar: + with open(filepath, 'wb') as f: + while True: + chunk = response.read(chunk_size) + if not chunk: + break + f.write(chunk) + pbar.update(len(chunk)) + else: + # Fallback to simple download if tqdm is not available + downloaded = 0 + with open(filepath, 'wb') as f: + while True: + chunk = 
response.read(chunk_size) + if not chunk: + break + f.write(chunk) + downloaded += len(chunk) + if total_size > 0: + percent = (downloaded / total_size) * 100 + sys.stdout.write(f"\r\033[KDownloading {desc}: {_format_bytes(downloaded)}/{_format_bytes(total_size)} ({percent:.1f}%)") + else: + sys.stdout.write(f"\r\033[KDownloading {desc}: {_format_bytes(downloaded)}...") + sys.stdout.flush() + sys.stdout.write("\n") + sys.stdout.flush() + @staticmethod def _download_parallel(urls_and_targets: List[Tuple[str, str]], base_url: str, destination_dir: str, desc_prefix: str = "Downloading", chunk_size: int = 1024 * 1024, - max_workers: Optional[int] = None, - origins: Optional[List[str]] = None) -> List[pathlib.Path]: + max_workers: Optional[int] = None) -> List[pathlib.Path]: """ Download multiple files in parallel from a base URL. @@ -1057,7 +1130,6 @@ def _download_parallel(urls_and_targets: List[Tuple[str, str]], base_url: str, desc_prefix (str): Prefix for progress bar descriptions chunk_size (int): Chunk size for downloads max_workers (int, optional): Maximum number of parallel workers. 
Defaults to min(32, num_files) - origins (List[str], optional): List of alternative URL bases to try first Returns: List[pathlib.Path]: List of downloaded file paths @@ -1073,15 +1145,7 @@ def _download_parallel(urls_and_targets: List[Tuple[str, str]], base_url: str, def download_one(url_suffix: str, target: str) -> Tuple[Optional[pathlib.Path], Optional[str]]: dest_path = os.path.join(destination_dir, target) desc = f"{desc_prefix} {target}" - - # Try custom origins first - if origins: - for origin_url in origins: - result = FileDataset._try_origin(origin_url, url_suffix + target, dest_path, desc, chunk_size) - if result is not None: - return result, None - - # Fall back to original URL + try: full_url = base_url.rstrip('/') + '/' + url_suffix.lstrip('/') + target req = Request(full_url) @@ -1114,64 +1178,6 @@ def download_one(url_suffix: str, target: str) -> Tuple[Optional[pathlib.Path], return downloaded_files - @staticmethod - def _download_sequential(url: str, filepath: os.PathLike, total_size: int, desc: str, - chunk_size: int = 1024 * 1024): - """Download file sequentially with progress bar.""" - import sys - - # Convert to Path if it's a string - if isinstance(filepath, str): - filepath = pathlib.Path(filepath) - - # Ensure parent directory exists - filepath.parent.mkdir(parents=True, exist_ok=True) - - req = Request(url) - with urlopen(req) as response: - if tqdm is not None: - if total_size > 0: - with tqdm(total=total_size, unit='B', unit_scale=True, - unit_divisor=1024, desc=f"Downloading {desc}", file=sys.stdout, - miniters=1, dynamic_ncols=True, ascii=False) as pbar: - with open(filepath, 'wb') as f: - while True: - chunk = response.read(chunk_size) - if not chunk: - break - f.write(chunk) - pbar.update(len(chunk)) - else: - # Unknown size - with tqdm(unit='B', unit_scale=True, unit_divisor=1024, - desc=f"Downloading {desc}", file=sys.stdout, miniters=1, - dynamic_ncols=True, ascii=False) as pbar: - with open(filepath, 'wb') as f: - while True: - 
chunk = response.read(chunk_size) - if not chunk: - break - f.write(chunk) - pbar.update(len(chunk)) - else: - # Fallback to simple download if tqdm is not available - downloaded = 0 - with open(filepath, 'wb') as f: - while True: - chunk = response.read(chunk_size) - if not chunk: - break - f.write(chunk) - downloaded += len(chunk) - if total_size > 0: - percent = (downloaded / total_size) * 100 - sys.stdout.write(f"\r\033[KDownloading {desc}: {_format_bytes(downloaded)}/{_format_bytes(total_size)} ({percent:.1f}%)") - else: - sys.stdout.write(f"\r\033[KDownloading {desc}: {_format_bytes(downloaded)}...") - sys.stdout.flush() - sys.stdout.write("\n") - sys.stdout.flush() - def from_files(dataset_dir: os.PathLike, extension: str = ".txt") -> FileDataset: """ Create a FileDataset from a list of files. @@ -1215,6 +1221,7 @@ def instance_metadata(self, file: os.PathLike) -> dict: return FromFilesDataset(dataset_dir, extension) +# Not implemented yet class URLDataset(IndexedDataset): """ Abstract base class for URL-backed datasets. @@ -1223,12 +1230,14 @@ class URLDataset(IndexedDataset): """ pass +# Not implemented yet class StreamingDataset(IterableDataset): """ Abstract base class for streaming datasets. """ pass +# Not implemented yet class GeneratedDataset(IterableDataset): """ Abstract base class for generated datasets. diff --git a/docs/api/tools.rst b/docs/api/tools.rst index 022761aa1..55d1706f3 100644 --- a/docs/api/tools.rst +++ b/docs/api/tools.rst @@ -3,4 +3,16 @@ Tools (:mod:`cpmpy.tools`) .. automodule:: cpmpy.tools :members: - :inherited-members: \ No newline at end of file + :inherited-members: + +.. 
toctree:: + :maxdepth: 1 + :caption: Tools: + + tools/dataset + tools/readers + tools/writers + tools/benchmarks + tools/benchmark_runner + tools/dimacs + tools/xcsp3 \ No newline at end of file From d99b77c17c7c3af1b2d055117496f76df0961bc3 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 5 Mar 2026 11:17:30 +0100 Subject: [PATCH 132/152] Clean utils --- cpmpy/tools/datasets/core.py | 8 +- cpmpy/tools/datasets/utils.py | 193 ++++++++++++++++++++++++++++++++-- 2 files changed, 189 insertions(+), 12 deletions(-) diff --git a/cpmpy/tools/datasets/core.py b/cpmpy/tools/datasets/core.py index 9c5f3ac0b..cc47ebd07 100644 --- a/cpmpy/tools/datasets/core.py +++ b/cpmpy/tools/datasets/core.py @@ -944,10 +944,12 @@ def _collect_all_metadata(self, force: bool = False, workers: int = 1): def _collect_one_metadata(self, file_path): """Collect metadata for a single instance file.""" meta_path = self._metadata_path(file_path) + metadata_error = None try: instance_meta = self.collect_instance_metadata(str(file_path)) except Exception as e: - instance_meta = {"_metadata_error": str(e)} + instance_meta = {} + metadata_error = str(e) # Separate portable from format-specific fields portable = portable_instance_metadata(instance_meta) @@ -973,8 +975,8 @@ def _collect_one_metadata(self, file_path): "format_metadata": format_specific, } - if "_metadata_error" in instance_meta: - sidecar["_metadata_error"] = instance_meta["_metadata_error"] + if metadata_error is not None: + sidecar["_metadata_error"] = metadata_error # Preserve previously extracted model features if present. # Otherwise, compute them from the parsed model when possible. diff --git a/cpmpy/tools/datasets/utils.py b/cpmpy/tools/datasets/utils.py index ae48e8701..a75c2ee5d 100644 --- a/cpmpy/tools/datasets/utils.py +++ b/cpmpy/tools/datasets/utils.py @@ -1,11 +1,8 @@ """ -Dataset utilities: generic download manager. - -Downloads one or multiple files from URLs. 
Supports optional parallel downloads -via a configurable worker count. How files are fetched (HTTP, progress bars, -chunking) is encapsulated here; datasets only pass (url, destination) and options. +Dataset utilities. """ +import json import pathlib import warnings from concurrent.futures import ThreadPoolExecutor, as_completed @@ -13,8 +10,187 @@ from urllib.request import Request, urlopen +from .metadata import ( + InstanceInfo, DatasetInfo, FeaturesInfo, FieldInfo, + _MODEL_FEATURE_FIELDS, _FORMAT_SPECIFIC_PREFIXES, +) + + +def portable_instance_metadata(metadata: dict) -> dict: + """ + Filter metadata to only portable, domain-specific fields. + + Strips model features (num_variables, constraint_types, ...) and + format-specific fields (opb_*, wcnf_*, mps_*, ...) linked to a specific + file format. + + Keeps domain-specific metadata that is independent of the file format, + such as ``jobs``, ``machines``, ``optimum``, ``horizon``, ``bounds``, etc. + + Arguments: + metadata (dict): Full sidecar metadata dictionary. + + Returns: + dict with only portable fields. + """ + return { + k: v for k, v in metadata.items() + if not k.startswith("_") + and k not in _MODEL_FEATURE_FIELDS + and not any(k.startswith(p) for p in _FORMAT_SPECIFIC_PREFIXES) + } + +def extract_model_features(model) -> dict: + """ + Extract generic CP features from a CPMpy Model. 
+ + Arguments: + model: a cpmpy.Model instance + + Returns: + dict with keys: num_variables, num_bool_variables, num_int_variables, + num_constraints, constraint_types, has_objective, objective_type, + domain_size_min, domain_size_max, domain_size_mean + """ + from cpmpy.transformations.get_variables import get_variables_model + from cpmpy.expressions.variables import _BoolVarImpl + from cpmpy.expressions.core import Expression + from cpmpy.expressions.utils import is_any_list + + variables = get_variables_model(model) + + num_bool = sum(1 for v in variables if isinstance(v, _BoolVarImpl)) + num_int = len(variables) - num_bool + + # Domain sizes (lb/ub available on all variable types) + domain_sizes = [int(v.ub) - int(v.lb) + 1 for v in variables] if variables else [] + + # Constraint types: collect .name from top-level constraints + constraint_type_counts = {} + + def _count_constraints(c): + if is_any_list(c): + for sub in c: + _count_constraints(sub) + elif isinstance(c, Expression): + name = c.name + constraint_type_counts[name] = constraint_type_counts.get(name, 0) + 1 + + for c in model.constraints: + _count_constraints(c) + + num_constraints = sum(constraint_type_counts.values()) + + # Objective + has_obj = model.objective_ is not None + obj_type = "none" + if has_obj: + obj_type = "min" if model.objective_is_min else "max" + + return { + "num_variables": len(variables), + "num_bool_variables": num_bool, + "num_int_variables": num_int, + "num_constraints": num_constraints, + "constraint_types": constraint_type_counts, + "has_objective": has_obj, + "objective_type": obj_type, + "domain_size_min": min(domain_sizes) if domain_sizes else None, + "domain_size_max": max(domain_sizes) if domain_sizes else None, + "domain_size_mean": round(sum(domain_sizes) / len(domain_sizes), 2) if domain_sizes else None, + } + + +def _init_metadata_worker(context_dict, collect_metadata_func, reader_func, open_func): + """Initialize worker process with dataset context.""" + global 
_metadata_worker_context + _metadata_worker_context = context_dict.copy() + _metadata_worker_context['collect_instance_metadata'] = collect_metadata_func + _metadata_worker_context['reader'] = reader_func + _metadata_worker_context['open_func'] = open_func + + +def _collect_one_metadata_worker(file_path_str): + """Worker function for process-based metadata collection.""" + global _metadata_worker_context + file_path = pathlib.Path(file_path_str) + dataset_dir = pathlib.Path(_metadata_worker_context['dataset_dir']) + meta_path = dataset_dir / (file_path.name + _metadata_worker_context['metadata_extension']) + + # Collect instance metadata using the provided function + collect_metadata = _metadata_worker_context['collect_instance_metadata'] + metadata_error = None + try: + instance_meta = collect_metadata(str(file_path)) + except Exception as e: + instance_meta = {} + metadata_error = str(e) + + # Separate portable from format-specific fields + portable = portable_instance_metadata(instance_meta) + format_specific = { + k: v for k, v in instance_meta.items() + if k not in portable and not k.startswith("_") + } + + # Derive instance name + stem = file_path.stem + for ext in (".xml", ".wcnf", ".opb"): + if stem.endswith(ext): + stem = stem[:len(stem) - len(ext)] + break + + # Build structured sidecar + sidecar = { + "dataset": _metadata_worker_context['dataset_metadata'], + "instance_name": stem, + "source_file": str(file_path.relative_to(dataset_dir)), + "category": _metadata_worker_context['category'], + "instance_metadata": portable, + "format_metadata": format_specific, + } + + if metadata_error is not None: + sidecar["_metadata_error"] = metadata_error + + # Preserve or compute model features + model_features = None + if meta_path.exists(): + try: + with open(meta_path, "r") as f: + existing = json.load(f) + if "model_features" in existing: + model_features = existing["model_features"] + except (json.JSONDecodeError, IOError): + pass + + if model_features is None: 
+ reader = _metadata_worker_context['reader'] + open_func = _metadata_worker_context['open_func'] + if not callable(reader): + raise TypeError( + f"Cannot extract model features for {file_path}: " + "no dataset reader configured." + ) + model = reader(str(file_path), open=open_func) + model_features = extract_model_features(model) + + sidecar["model_features"] = model_features + + with open(meta_path, "w") as f: + json.dump(sidecar, f, indent=2) + + return str(file_path) + + +# ---------------------------------------------------------------------------- # +# Download utilities. # +# ---------------------------------------------------------------------------- # + def _get_content_length(url: str) -> int: - """Return Content-Length for url, or 0 if unknown.""" + """ + Return Content-Length for url, or 0 if unknown. + """ try: req = Request(url) req.get_method = lambda: "HEAD" @@ -23,7 +199,6 @@ def _get_content_length(url: str) -> int: except Exception: return 0 - def _download_url( url: str, destination: Union[str, pathlib.Path], @@ -42,8 +217,8 @@ def _download_url( desc = destination.name total_size = _get_content_length(url) if _sequential_impl is None: - from cpmpy.tools.dataset._base import _Dataset - _sequential_impl = _Dataset._download_sequential + from cpmpy.tools.datasets.core import FileDataset + _sequential_impl = FileDataset._download_sequential _sequential_impl(url, destination, total_size, desc, chunk_size) return destination From b425c2286e3250133095f50ea3ccc74f5ab1828e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 5 Mar 2026 11:18:58 +0100 Subject: [PATCH 133/152] Remove unused config --- cpmpy/tools/datasets/__init__.py | 2 - cpmpy/tools/datasets/config.py | 74 -------------------------------- 2 files changed, 76 deletions(-) delete mode 100644 cpmpy/tools/datasets/config.py diff --git a/cpmpy/tools/datasets/__init__.py b/cpmpy/tools/datasets/__init__.py index 2ef1bd91a..dca7fea28 100644 --- a/cpmpy/tools/datasets/__init__.py +++ 
b/cpmpy/tools/datasets/__init__.py @@ -1,7 +1,5 @@ from ._base import ( - extract_model_features, expand_varying_kwargs, - portable_instance_metadata, FileDataset, ) from .metadata import ( diff --git a/cpmpy/tools/datasets/config.py b/cpmpy/tools/datasets/config.py deleted file mode 100644 index f2fc0e14c..000000000 --- a/cpmpy/tools/datasets/config.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Configuration for CPMpy dataset download origins. - -This module provides configuration for custom download origins that can be used -as alternatives to the original dataset sources. Origins are tried in order, -falling back to the original source if all custom origins fail. - -Configuration can be set via: -1. Environment variables (CPMPY_DATASET_ORIGINS_{DATASET_NAME}) -2. This config file -3. Class attributes in dataset classes -""" - -import os -from typing import Dict, List, Optional - -# Default origins configuration -# Format: {dataset_name: [list of URL bases]} -_DEFAULT_ORIGINS: Dict[str, List[str]] = { - # Example: - # "xcsp3": ["https://cpmpy-datasets.example.com/xcsp3"], - # "mse": ["https://cpmpy-datasets.example.com/mse"], -} - -def get_origins(dataset_name: str) -> List[str]: - """ - Get custom origins for a dataset. - - Checks in order: - 1. Environment variable CPMPY_DATASET_ORIGINS_{DATASET_NAME} - 2. _DEFAULT_ORIGINS dictionary - 3. 
Returns empty list (no custom origins) - - Arguments: - dataset_name (str): Name of the dataset (e.g., "xcsp3", "mse") - - Returns: - List[str]: List of origin URL bases to try - """ - # Check environment variable first - env_var = f"CPMPY_DATASET_ORIGINS_{dataset_name.upper()}" - env_value = os.getenv(env_var) - if env_value: - # Split by comma and strip whitespace - return [url.strip() for url in env_value.split(",") if url.strip()] - - # Check default origins - return _DEFAULT_ORIGINS.get(dataset_name, []) - -def set_default_origin(dataset_name: str, origin_url: str): - """ - Set a default origin URL for a dataset (for programmatic configuration). - - Arguments: - dataset_name (str): Name of the dataset - origin_url (str): Base URL for the origin - """ - if dataset_name not in _DEFAULT_ORIGINS: - _DEFAULT_ORIGINS[dataset_name] = [] - if origin_url not in _DEFAULT_ORIGINS[dataset_name]: - _DEFAULT_ORIGINS[dataset_name].append(origin_url) - -def set_default_origins(dataset_name: str, origin_urls: List[str]): - """ - Set multiple default origin URLs for a dataset (for programmatic configuration). - - Arguments: - dataset_name (str): Name of the dataset - origin_urls (List[str]): List of base URLs for origins - """ - _DEFAULT_ORIGINS[dataset_name] = origin_urls.copy() - - - From 5424821413f2afb512b1dda11e5f3b034b774e5f Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Thu, 5 Mar 2026 12:09:32 +0100 Subject: [PATCH 134/152] Refactor metadata --- cpmpy/tools/datasets/core.py | 2 + cpmpy/tools/datasets/metadata.py | 235 +++++++++++++++++++------------ 2 files changed, 149 insertions(+), 88 deletions(-) diff --git a/cpmpy/tools/datasets/core.py b/cpmpy/tools/datasets/core.py index cc47ebd07..7dbf0d930 100644 --- a/cpmpy/tools/datasets/core.py +++ b/cpmpy/tools/datasets/core.py @@ -792,6 +792,7 @@ def instance_metadata(self, instance: os.PathLike) -> InstanceInfo: InstanceInfo: The metadata for the instance. 
""" metadata = { + 'id': str(instance), 'dataset': self.name, 'category': self.category(), 'name': pathlib.Path(instance).name.replace(self.extension, ''), @@ -1215,6 +1216,7 @@ def download(self) -> None: def instance_metadata(self, file: os.PathLike) -> dict: metadata = { + 'id': str(file), 'dataset_dir': str(self.dataset_dir), 'name': pathlib.Path(file).name.replace(self.extension, ''), 'path': file, diff --git a/cpmpy/tools/datasets/metadata.py b/cpmpy/tools/datasets/metadata.py index 59d4d6540..b6d0574e4 100644 --- a/cpmpy/tools/datasets/metadata.py +++ b/cpmpy/tools/datasets/metadata.py @@ -1,6 +1,11 @@ """ Structured Metadata Classes for CPMpy Datasets +When iterating over a dataset, 2-tuples (instance, metadata) are returned. +The metadata is a subclass of the standard python dictionary. It has additional +methods that aid in managing the metadata and help convert it to different formats, +like Croissant, GBD, Dataset Cards, etc. + Provides: - :class:`FieldInfo` — schema for one domain metadata field - :class:`FeaturesInfo` — schema for all domain metadata fields of a dataset @@ -33,7 +38,7 @@ # --------------------------------------------------------------------------- # System-level keys added by instance_metadata() — not domain metadata -_SYSTEM_KEYS: frozenset = frozenset({"dataset", "category", "name", "path"}) +_SYSTEM_KEYS: frozenset = frozenset({"id", "dataset", "categories", "name", "path"}) # Fields produced by extract_model_features() (requires full CPMpy model parse) _MODEL_FEATURE_FIELDS: frozenset = frozenset({ @@ -45,10 +50,10 @@ # Live Python objects added by Load — not JSON-serialisable, excluded from exports _MODEL_OBJECT_KEYS: frozenset = frozenset({ - "decision_variables", + "variables", }) -# Prefixes for format-specific metadata (not portable across translations) +# Prefixes for format-specific metadata (not portable across format translations) _FORMAT_SPECIFIC_PREFIXES: tuple = ("opb_", "wcnf_", "mps_", "xcsp_", "dimacs_") @@ -64,20 
+69,24 @@ class FieldInfo: Inspired by HuggingFace ``Value`` and TFDS ``FeatureConnector``, but intentionally simpler — no serialisation semantics needed for CO benchmarks. - Parameters - ---------- - dtype: - Data type string: ``"int"``, ``"float"``, ``"str"``, ``"bool"``, - ``"dict"``, or ``"list"``. - description: - Human-readable description of the field. - nullable: - Whether the field may be absent / ``None`` for some instances. - example: - Optional example value (used in documentation / cards). + Arguments: + + dtype (str or type): Canonical dtype string, schema.org dtype string, + or Python type. + Accepted canonical strings are ``"int"``, ``"float"``, ``"str"``, + ``"bool"``, ``"dict"``, and ``"list"``. Accepted schema.org strings + are ``"sc:Integer"``, ``"sc:Float"``, ``"sc:Text"``, + ``"sc:Boolean"``, ``"sc:StructuredValue"``, and ``"sc:ItemList"``. + Accepted Python types are ``int``, ``float``, ``str``, ``bool``, + ``dict``, and ``list``. + Values are normalised to the canonical string representation at + construction time. + description (str): Human-readable description of the field. + nullable (bool): Whether the field may be absent / ``None`` for some instances. + example (Any): Optional example value (used in documentation / cards). """ - dtype: str + dtype: Any description: str = "" nullable: bool = True example: Any = None @@ -85,10 +94,50 @@ class FieldInfo: # Maps internal dtype strings → schema.org types (for Croissant export) _DTYPE_TO_SCHEMA_ORG: Dict[str, str] = None # populated below as class var + def __post_init__(self): + self.dtype = self.normalize_dtype(self.dtype) + def schema_org_type(self) -> str: """Return the schema.org dataType string for use in Croissant fields.""" return _DTYPE_TO_SCHEMA_ORG.get(self.dtype, "sc:Text") + @classmethod + def normalize_dtype(cls, dtype: Any) -> str: + """ + Normalise a dtype specification to a canonical dtype string. 
+ + Accepts canonical string dtypes, schema.org dtype strings, and selected + builtin Python types. + Raises when a dtype cannot be normalised. + """ + if isinstance(dtype, str): + if dtype in _DTYPE_TO_SCHEMA_ORG: + return dtype + mapped_schema_dtype = _SCHEMA_ORG_TO_DTYPE.get(dtype) + if mapped_schema_dtype is not None: + return mapped_schema_dtype + known = ", ".join(sorted(_DTYPE_TO_SCHEMA_ORG.keys())) + known_schema = ", ".join(sorted(_SCHEMA_ORG_TO_DTYPE.keys())) + raise ValueError( + f"Unknown dtype string {dtype!r}. " + f"Use a canonical dtype ({known}) or schema.org dtype ({known_schema})." + ) + + if isinstance(dtype, type): + mapped = _PY_TYPE_TO_DTYPE.get(dtype) + if mapped is not None: + return mapped + known_types = ", ".join(t.__name__ for t in _PY_TYPE_TO_DTYPE) + raise TypeError( + f"Cannot normalise Python type {dtype!r} to a dataset dtype. " + f"Known Python types: {known_types}." + ) + + raise TypeError( + "dtype must be a canonical dtype string, schema.org dtype string, " + f"or Python type, got {type(dtype).__name__}." 
+ ) + @classmethod def coerce(cls, value: Any) -> "FieldInfo": """ @@ -96,20 +145,23 @@ def coerce(cls, value: Any) -> "FieldInfo": Accepted forms: - - ``FieldInfo(...)`` — returned as-is - - ``"int"`` — treated as ``FieldInfo(dtype="int")`` - - ``("int", "desc")`` — ``FieldInfo(dtype="int", description="desc")`` - - ``("int", "desc", False)`` — adds ``nullable=False`` + - ``FieldInfo(...)`` — returned as-is + - ``"int"``, ``"sc:Integer"``, or ``int`` — treated as ``FieldInfo(dtype=...)`` + - ``("int", "desc")`` — ``FieldInfo(dtype="int", description="desc")`` + - ``("sc:Text", "desc")`` — ``FieldInfo(dtype="sc:Text", description="desc")`` + - ``(int, "desc")`` — ``FieldInfo(dtype=int, description="desc")`` + - ``("int", "desc", False)`` — adds ``nullable=False`` """ if isinstance(value, cls): return value - if isinstance(value, str): + if isinstance(value, (str, type)): return cls(dtype=value) if isinstance(value, tuple): return cls(*value) raise TypeError( f"Cannot coerce {value!r} to FieldInfo. " - "Use a FieldInfo, a dtype string, or a (dtype, description[, nullable]) tuple." + "Use a FieldInfo, a dtype string or Python type, " + "or a (dtype, description[, nullable]) tuple." ) def to_dict(self) -> dict: @@ -130,6 +182,19 @@ def to_dict(self) -> dict: "list": "sc:ItemList", } +_PY_TYPE_TO_DTYPE: Dict[type, str] = { + int: "int", + float: "float", + str: "str", + bool: "bool", + dict: "dict", + list: "list", +} + +_SCHEMA_ORG_TO_DTYPE: Dict[str, str] = { + schema_type: dtype for dtype, schema_type in _DTYPE_TO_SCHEMA_ORG.items() +} + # --------------------------------------------------------------------------- # FeaturesInfo @@ -240,9 +305,9 @@ class InstanceInfo(dict): """ Per-instance metadata dict with structured access. - Inherits from ``dict`` for full backward compatibility — all existing - ``meta['year']``, ``meta.get('jobs')``, ``for k, v in meta.items()`` - usage continues unchanged. 
+ Inherits from ``dict`` and supports normal dictionary access patterns + such as ``meta['year']``, ``meta.get('jobs')``, and + ``for k, v in meta.items()``. Structured access is additive: @@ -250,19 +315,21 @@ class InstanceInfo(dict): file, info = dataset[0] - # Backward-compatible dict access (unchanged): - info['year'] + # Dict access: + info['name'] info.get('jobs', 0) + info['categories']['year'] # New structured properties: info.id # "jsplib/abz5" + info.category # {"year": 2024, "track": "CSP", ...} info.domain_metadata # {"jobs": 10, "machines": 5, ...} info.model_features # {"num_variables": 100, ...} info.format_metadata # {"opb_num_variables": 12, ...} # Standards converters: - info.to_croissant_example() - info.to_gbd_features() + info.to_croissant() + info.to_gbd() """ @property @@ -270,12 +337,22 @@ def id(self) -> str: """ Stable instance identifier. - Format: ``"dataset/cat_val1/cat_val2/.../instance_name"`` + Uses explicit ``id`` when present (recommended for dataset-defined + identifiers). Otherwise falls back to: + ``"dataset/cat_val1/cat_val2/.../instance_name"``. + + For file-based datasets, ``id`` is typically set to the instance + reference returned as the first element of the dataset ``(x, y)`` + tuple. 
Example: ``"xcsp3/2024/CSP/AverageAvoiding-20_c24"`` """ + explicit = self.get("id") + if explicit: + return str(explicit) + parts = [str(self.get("dataset", ""))] - cat = self.get("category", {}) + cat = self.get("categories", {}) if isinstance(cat, dict): parts += [str(v) for v in cat.values()] parts.append(str(self.get("name", ""))) @@ -292,9 +369,9 @@ def dataset(self) -> str: return self.get("dataset", "") @property - def category(self) -> dict: + def categories(self) -> dict: """Category dict (year, track, variant, family, …).""" - return self.get("category", {}) + return self.get("categories", {}) @property def domain_metadata(self) -> dict: @@ -392,9 +469,9 @@ def __or__(self, other: dict) -> "InstanceInfo": def __ror__(self, other: dict) -> "InstanceInfo": return InstanceInfo(super().__ror__(other)) - def to_croissant_example(self) -> dict: + def to_croissant(self) -> dict: """ - Convert to a Croissant-compatible example record. + Convert to a Croissant-compatible record. Returns a flat dict with ``id``, domain metadata, and model features. """ @@ -403,19 +480,24 @@ def to_croissant_example(self) -> dict: record.update(self.model_features) return record - def to_gbd_features(self) -> dict: + def to_gbd(self) -> dict: """ Convert to a GBD-style (Global Benchmark Database) feature record. GBD uses hash-based instance IDs; here we use the path-based ``.id`` property as a stable identifier instead. + + .. note:: + + In the future, hash-based instance IDs coming from GBD might be added. + For now, this has to be added manually. 
""" record: dict = { "id": self.id, "filename": self.get("name", ""), "dataset": self.get("dataset", ""), } - record.update(self.category) + record.update(self.categories) record.update(self.domain_metadata) record.update(self.model_features) return record @@ -473,43 +555,11 @@ def name(self) -> str: def description(self) -> str: return self.get("description", "") - @property - def url(self) -> str: - """Homepage URL (backward-compat alias for :attr:`homepage`).""" - return self.get("url", "") or self.get("homepage", "") - @property def homepage(self) -> str: """Homepage URL (HuggingFace / TFDS naming convention).""" return self.get("homepage", "") or self.get("url", "") - @property - def version(self) -> Optional[str]: - return self.get("version") - - @property - def license(self) -> Optional[Union[str, List[str]]]: - return self.get("license") - - @property - def domain(self) -> str: - """Primary problem domain (e.g. ``"scheduling"``, ``"sat"``, ``"cp"``).""" - return self.get("domain", "constraint_programming") - - @property - def tags(self) -> List[str]: - return self.get("tags", []) - - @property - def language(self) -> Optional[str]: - """ - Problem format / modelling language (e.g. ``"XCSP3"``, ``"OPB"``, ``"JSPLib"``). - - Analogous to HuggingFace's ``language`` field, but for CO format languages - rather than human languages. - """ - return self.get("language") - @property def features(self) -> Optional[FeaturesInfo]: """ @@ -528,27 +578,36 @@ def features(self) -> Optional[FeaturesInfo]: return FeaturesInfo.from_dict(raw) return None - @property - def release_notes(self) -> Optional[Dict[str, str]]: + # -- JSON serialisation --------------------------------------------------- + + def to_jsonable(self) -> dict: """ - Version changelog dict: ``{version_string: description}``. + Return a JSON-serialisable plain dict representation. - Inspired by TFDS ``BuilderConfig.release_notes``. 
+ In particular, this serialises ``features`` (when present) to a plain + dict via :meth:`FeaturesInfo.to_dict`. + """ + data = dict(self) + feats = data.get("features") + if isinstance(feats, FeaturesInfo): + data["features"] = feats.to_dict() + return data - Example:: + def to_json(self, **kwargs) -> str: + """ + Return this metadata as a JSON string. - release_notes = { - "1.0.0": "Initial release.", - "1.1.0": "Added 2024 track instances.", - } + Arguments: + **kwargs: forwarded to :func:`json.dumps`. """ - return self.get("release_notes") + import json + return json.dumps(self.to_jsonable(), **kwargs) # -- Card generation ------------------------------------------------------ def card(self, format: str = "markdown") -> str: """ - Generate a dataset card. + Generate a Dataset Card. Follows the HuggingFace Hub convention: a YAML frontmatter block (machine-readable) followed by a markdown body (human-readable). @@ -739,29 +798,29 @@ def to_croissant(self) -> dict: # Standalone adapter functions (for use as target_transform) # --------------------------------------------------------------------------- -def to_croissant_example(metadata: dict) -> dict: +def to_croissant(metadata: dict) -> dict: """ - Convert instance metadata to a Croissant example record. + Convert instance metadata to a Croissant record. Usable as a ``target_transform``:: - from cpmpy.tools.datasets.metadata import to_croissant_example - dataset = JSPLibDataset(root="data", target_transform=to_croissant_example) + from cpmpy.tools.datasets.metadata import to_croissant + dataset = JSPLibDataset(root="data", target_transform=to_croissant) for instance, record in dataset: print(record["id"], record["jobs"]) """ - return InstanceInfo(metadata).to_croissant_example() + return InstanceInfo(metadata).to_croissant() -def to_gbd_features(metadata: dict) -> dict: +def to_gbd(metadata: dict) -> dict: """ Convert instance metadata to a GBD-style feature record. 
Usable as a ``target_transform``:: - from cpmpy.tools.datasets.metadata import to_gbd_features - dataset = JSPLibDataset(root="data", target_transform=to_gbd_features) + from cpmpy.tools.datasets.metadata import to_gbd + dataset = JSPLibDataset(root="data", target_transform=to_gbd) for instance, record in dataset: print(record["id"], record["num_constraints"]) """ - return InstanceInfo(metadata).to_gbd_features() + return InstanceInfo(metadata).to_gbd() From df426eacd3539c0195c7aac3b918b26311317a4e Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 13:47:10 +0100 Subject: [PATCH 135/152] Fix imports and names --- cpmpy/tools/datasets/__init__.py | 16 +++++++++++----- cpmpy/tools/datasets/core.py | 7 +++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/cpmpy/tools/datasets/__init__.py b/cpmpy/tools/datasets/__init__.py index dca7fea28..600c78900 100644 --- a/cpmpy/tools/datasets/__init__.py +++ b/cpmpy/tools/datasets/__init__.py @@ -1,14 +1,18 @@ -from ._base import ( +from .core import ( expand_varying_kwargs, FileDataset, ) +from .utils import ( + extract_model_features, + portable_instance_metadata, +) from .metadata import ( InstanceInfo, DatasetInfo, FeaturesInfo, FieldInfo, - to_croissant_example, - to_gbd_features, + to_croissant, + to_gbd, ) __all__ = [ @@ -22,8 +26,8 @@ "DatasetInfo", "FeaturesInfo", "FieldInfo", - "to_croissant_example", - "to_gbd_features", + "to_croissant", + "to_gbd", # Datasets "MIPLibDataset", "JSPLibDataset", @@ -32,6 +36,7 @@ "XCSP3Dataset", "OPBDataset", "MaxSATEvalDataset", + "SATDataset", # Transforms "Compose", "Open", @@ -50,6 +55,7 @@ from .xcsp3 import XCSP3Dataset from .opb import OPBDataset from .mse import MaxSATEvalDataset +from .sat import SATDataset from .transforms import Compose, Open, Load, Serialize, Translate, SaveToFile, Lambda, extract_format_metadata # Backward compatibility alias Parse = Load diff --git a/cpmpy/tools/datasets/core.py b/cpmpy/tools/datasets/core.py index 
7dbf0d930..f20000a76 100644 --- a/cpmpy/tools/datasets/core.py +++ b/cpmpy/tools/datasets/core.py @@ -109,9 +109,8 @@ from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed import multiprocessing -from libraries.cpmpy.cpmpy.tools.datasets import FeaturesInfo -from libraries.cpmpy.cpmpy.tools.datasets.metadata import DatasetInfo, InstanceInfo -from libraries.cpmpy.cpmpy.tools.datasets.utils import extract_model_features, portable_instance_metadata +from .metadata import FeaturesInfo, DatasetInfo, InstanceInfo +from .utils import extract_model_features, portable_instance_metadata # tqdm as an optional dependency, provides prettier progress bars try: @@ -851,7 +850,7 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: file_path = files[index] filename = str(file_path) - metadata = self.instance_metadata(file=filename) + metadata = self.instance_metadata(filename) if self.target_transform: metadata = self.target_transform(metadata) From fdad3db1de0382805294987a09cfd7825178391a Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 13:49:19 +0100 Subject: [PATCH 136/152] Update datasets --- cpmpy/tools/datasets/jsplib.py | 11 ++- cpmpy/tools/datasets/miplib.py | 13 +-- cpmpy/tools/datasets/mse.py | 11 ++- cpmpy/tools/datasets/nurserostering.py | 13 +-- cpmpy/tools/datasets/opb.py | 26 ++---- cpmpy/tools/datasets/psplib.py | 13 +-- cpmpy/tools/datasets/sat.py | 119 ++++++++----------------- cpmpy/tools/datasets/xcsp3.py | 11 ++- 8 files changed, 91 insertions(+), 126 deletions(-) diff --git a/cpmpy/tools/datasets/jsplib.py b/cpmpy/tools/datasets/jsplib.py index 9f4ec507f..b8dc529bc 100644 --- a/cpmpy/tools/datasets/jsplib.py +++ b/cpmpy/tools/datasets/jsplib.py @@ -14,7 +14,7 @@ import numpy as np import cpmpy as cp -from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.core import FileDataset from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo @@ -52,7 +52,7 @@ class 
JSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible "instance_description": FieldInfo("str", "Human-readable description from file header comments", nullable=True), }) - def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): + def __init__(self, root: str = ".", transform=None, target_transform=None, download: bool = False, **kwargs): """ Initialize the JSPLib Dataset. @@ -73,7 +73,7 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension="", - metadata_workers=metadata_workers + **kwargs ) @staticmethod @@ -89,6 +89,9 @@ def _loader(content: str): def category(self) -> dict: return {} # no categories + def categories(self) -> dict: + return self.category() + def collect_instance_metadata(self, file) -> dict: """ Extract metadata from instances.json and instance file header. @@ -146,7 +149,7 @@ def download(self): print("Downloading JSPLib instances from github.com/tamy0612/JSPLIB") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) + target_download_path = self._download_file(url, target, destination=str(target_download_path)) except ValueError as e: raise ValueError(f"No dataset available on {url}. 
Error: {str(e)}") diff --git a/cpmpy/tools/datasets/miplib.py b/cpmpy/tools/datasets/miplib.py index 6dab416a9..2d9def666 100644 --- a/cpmpy/tools/datasets/miplib.py +++ b/cpmpy/tools/datasets/miplib.py @@ -11,7 +11,7 @@ import pathlib import io -from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.core import FileDataset class MIPLibDataset(FileDataset): # torch.utils.data.Dataset compatible @@ -42,7 +42,7 @@ def __init__( year: int = 2024, track: str = "exact-unweighted", transform=None, target_transform=None, download: bool = False, - metadata_workers: int = 1 + **kwargs ): """ Constructor for a dataset object of the MIPLib competition. @@ -70,7 +70,7 @@ def __init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".mps.gz", - metadata_workers=metadata_workers + **kwargs ) @staticmethod @@ -84,7 +84,7 @@ def reader(file_path, open=open): return load_scip(file_path, open=open) @staticmethod - def loader(content: str): + def _loader(content: str): """ Loader for MIPLib dataset. Loads a CPMpy model from raw MPS/LP content string. @@ -109,6 +109,9 @@ def category(self) -> dict: "year": self.year, "track": self.track } + + def categories(self) -> dict: + return self.category() def download(self): @@ -119,7 +122,7 @@ def download(self): print("Downloading MIPLib instances from miplib.zib.de") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) + target_download_path = self._download_file(url, target, destination=str(target_download_path)) except ValueError as e: raise ValueError(f"No dataset available on {url}. 
Error: {str(e)}") diff --git a/cpmpy/tools/datasets/mse.py b/cpmpy/tools/datasets/mse.py index de1373fec..c5e540de6 100644 --- a/cpmpy/tools/datasets/mse.py +++ b/cpmpy/tools/datasets/mse.py @@ -14,7 +14,7 @@ import cpmpy as cp from cpmpy.tools.io.wcnf import load_wcnf -from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.core import FileDataset from cpmpy.tools.datasets.metadata import FeaturesInfo @@ -62,7 +62,7 @@ def __init__( transform=None, target_transform=None, download: bool = False, dataset_dir: Optional[os.PathLike] = None, - metadata_workers: int = 1 + **kwargs ): """ Constructor for a dataset object of the MaxSAT Evaluation competition. @@ -98,7 +98,7 @@ def __init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".wcnf.xz", - metadata_workers=metadata_workers + **kwargs ) @@ -116,6 +116,9 @@ def category(self) -> dict: "track": self.track } + def categories(self) -> dict: + return self.category() + def collect_instance_metadata(self, file) -> dict: """ Extract statistics from WCNF header comments. @@ -155,7 +158,7 @@ def download(self): print(f"Downloading MaxSAT Eval {self.year} {self.track} instances from cs.helsinki.fi") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) + target_download_path = self._download_file(url, target, destination=str(target_download_path)) except ValueError as e: raise ValueError(f"No dataset available for year {self.year} and track {self.track}. 
Error: {str(e)}") diff --git a/cpmpy/tools/datasets/nurserostering.py b/cpmpy/tools/datasets/nurserostering.py index 918b79c35..e9f84bc78 100644 --- a/cpmpy/tools/datasets/nurserostering.py +++ b/cpmpy/tools/datasets/nurserostering.py @@ -14,7 +14,7 @@ import io import cpmpy as cp -from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.core import FileDataset from cpmpy.tools.datasets.metadata import FeaturesInfo # Optional dependencies @@ -60,7 +60,7 @@ class NurseRosteringDataset(FileDataset): # torch.utils.data.Dataset compatible "num_shifts": ("int", "Number of distinct shift types"), }) - def __init__(self, root: str = ".", transform=None, target_transform=None, download:bool=False, sort_key=None, metadata_workers: int = 1): + def __init__(self, root: str = ".", transform=None, target_transform=None, download:bool=False, sort_key=None, **kwargs): """ Initialize the Nurserostering Dataset. @@ -83,7 +83,7 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".txt", - metadata_workers=metadata_workers + **kwargs ) @staticmethod @@ -97,7 +97,7 @@ def reader(file_path, open=open): return load_nurserostering(file_path, open=open) @staticmethod - def loader(content: str): + def _loader(content: str): """ Loader for Nurse Rostering dataset. Loads a CPMpy model from raw Nurse Rostering content string. @@ -109,6 +109,9 @@ def loader(content: str): def category(self) -> dict: return {} # no categories + def categories(self) -> dict: + return self.category() + def collect_instance_metadata(self, file) -> dict: """ Extract scheduling metadata from nurse rostering instance. 
@@ -137,7 +140,7 @@ def download(self): print("Downloading Nurserostering instances from schedulingbenchmarks.org") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) + target_download_path = self._download_file(url, target, destination=str(target_download_path)) except ValueError as e: raise ValueError(f"No dataset available on {url}. Error: {str(e)}") diff --git a/cpmpy/tools/datasets/opb.py b/cpmpy/tools/datasets/opb.py index 1414af6e1..be83c2d47 100644 --- a/cpmpy/tools/datasets/opb.py +++ b/cpmpy/tools/datasets/opb.py @@ -11,7 +11,7 @@ import tarfile import io -from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.core import FileDataset from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo @@ -54,7 +54,7 @@ def __init__( competition: bool = True, transform=None, target_transform=None, download: bool = False, - metadata_workers: int = 1 + **kwargs ): """ Constructor for a dataset object of the PB competition. @@ -91,22 +91,11 @@ def __init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".opb.xz", - metadata_workers=metadata_workers + **kwargs ) - @staticmethod - def reader(file_path, open=open): - """ - Reader for OPB dataset. - Parses a file path directly into a CPMpy model. - For backward compatibility. Consider using read() + load() instead. - """ - from cpmpy.tools.io.opb import load_opb - return load_opb(file_path, open=open) - - @staticmethod - def loader(content: str): + def _loader(content: str): """ Loader for OPB dataset. Loads a CPMpy model from raw OPB content string. @@ -121,7 +110,10 @@ def category(self) -> dict: "track": self.track } - def collect_instance_metadata(self, file) -> dict: + def categories(self) -> dict: + return self.category() + + def collect_instance_metadata(self, file: os.PathLike) -> dict: """Extract metadata from OPB filename and file header. 
Parses the `* #variable= ... #constraint= ...` header line and @@ -163,7 +155,7 @@ def download(self): print(f"Downloading OPB {self.year} {self.track} {'competition' if self.competition else 'non-competition'} instances from www.cril.univ-artois.fr") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) + target_download_path = self._download_file(url, target, destination=str(target_download_path)) except ValueError as e: raise ValueError(f"No dataset available for year {self.year}. Error: {str(e)}") diff --git a/cpmpy/tools/datasets/psplib.py b/cpmpy/tools/datasets/psplib.py index 74a7604e0..74574aa7c 100644 --- a/cpmpy/tools/datasets/psplib.py +++ b/cpmpy/tools/datasets/psplib.py @@ -9,7 +9,7 @@ import io import zipfile -from cpmpy.tools.datasets._base import FileDataset +from cpmpy.tools.datasets.core import FileDataset from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo @@ -45,7 +45,7 @@ class PSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible }) - def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): + def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", transform=None, target_transform=None, download: bool = False, **kwargs): """ Constructor for a dataset object for PSPlib. @@ -83,7 +83,7 @@ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=f".{self.family_codes[self.variant]}", - metadata_workers=metadata_workers + **kwargs ) @staticmethod @@ -97,7 +97,7 @@ def reader(file_path, open=open): return load_rcpsp(file_path, open=open) @staticmethod - def loader(content: str): + def _loader(content: str): """ Loader for PSPLib dataset. 
Loads a CPMpy model from raw RCPSP content string. @@ -112,6 +112,9 @@ def category(self) -> dict: "family": self.family } + def categories(self) -> dict: + return self.category() + def collect_instance_metadata(self, file) -> dict: """Extract project metadata from SM file header.""" import re @@ -177,7 +180,7 @@ def download(self): print(f"Downloading PSPLib {self.variant} {self.family} instances from www.om-db.wi.tum.de") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) + target_download_path = self._download_file(url, target, destination=str(target_download_path)) except ValueError as e: raise ValueError(f"No dataset available for variant {self.variant} and family {self.family}. Error: {str(e)}") diff --git a/cpmpy/tools/datasets/sat.py b/cpmpy/tools/datasets/sat.py index 770539564..991cf05af 100644 --- a/cpmpy/tools/datasets/sat.py +++ b/cpmpy/tools/datasets/sat.py @@ -1,7 +1,8 @@ """ -SAT Competition Dataset +SAT Competition Dataset. -Instances from the benchmark database (benchmark-database.de) for the SAT competition. +Instances are fetched from benchmark-database.de via ``getinstances``. +Each returned line is an instance URL, usually served as XZ-compressed DIMACS. """ import io @@ -12,31 +13,31 @@ import tempfile from urllib.request import Request, urlopen -from cpmpy.tools.dataset._base import URLDataset -from cpmpy.tools.dataset.utils import download as download_manager +from cpmpy.tools.datasets.core import FileDataset +from cpmpy.tools.datasets.metadata import FeaturesInfo -# Base URL for the instance list (getinstances returns one file URL per line) INSTANCE_LIST_URL = "https://benchmark-database.de/getinstances" -DEFAULT_QUERY = "track=main_2025" -DEFAULT_CONTEXT = "cnf" -class SATDataset(URLDataset): +class SATDataset(FileDataset): """ SAT competition benchmark dataset (DIMACS CNF). 
- - Instances are listed at benchmark-database.de via getinstances; each line - is a URL to a CNF file (served XZ-compressed). Files are stored as .cnf.xz. - - More information: https://benchmark-database.de/ """ name = "sat" description = "SAT competition benchmark instances (DIMACS CNF) from benchmark-database.de." - url = "https://benchmark-database.de/" - license = "" + homepage = "https://benchmark-database.de/" citation = [] + version = "2025" + license = "competition-specific" + domain = "sat" + tags = ["satisfaction", "sat", "cnf", "dimacs"] + language = "DIMACS-CNF" + features = FeaturesInfo({ + "dimacs_num_variables": ("int", "Number of propositional variables from DIMACS p-line"), + "dimacs_num_clauses": ("int", "Number of clauses from DIMACS p-line"), + }) def __init__( self, @@ -65,32 +66,17 @@ def __init__( self.context = context dataset_dir = self.root / self.name / track / context - super().__init__( dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".cnf.xz", - **kwargs + **kwargs, ) @staticmethod - def reader(file_path, open=open): - """ - Reader for SAT dataset. - Parses a DIMACS CNF file path into a CPMpy model (uses open for .cnf.xz). - """ - with open(file_path) as f: - content = f.read() - return SATDataset.loader(content) - - @staticmethod - def loader(content: str): - """ - Loader for SAT dataset. - Loads a CPMpy model from raw DIMACS CNF content string. 
- """ + def _loader(content: str): from cpmpy.tools.dimacs import load_dimacs with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".cnf") as tmp: tmp.write(content) @@ -100,27 +86,17 @@ def loader(content: str): finally: os.unlink(tmp_path) + def category(self) -> dict: + return {"track": self.track, "context": self.context} + + def categories(self) -> dict: + return self.category() + def open(self, instance: os.PathLike) -> io.TextIOBase: - """Open instance file; use lzma for .cnf.xz (XZ-compressed) files.""" path = str(instance) return lzma.open(instance, "rt") if path.endswith(".xz") else open(instance, "r") - def instance_metadata(self, file: pathlib.Path) -> dict: - """Add instance metadata; ensure name strips .cnf from stem (e.g. hash.cnf.xz -> hash).""" - metadata = super().instance_metadata(file) - stem = pathlib.Path(file).stem - if stem.endswith(".cnf"): - metadata["name"] = stem[:-4] - return metadata - - def category(self) -> dict: - return { - "track": self.track, - "context": self.context, - } - def collect_instance_metadata(self, file) -> dict: - """Extract num variables and num clauses from DIMACS p-line.""" result = {} try: with self.open(file) as f: @@ -136,26 +112,19 @@ def collect_instance_metadata(self, file) -> dict: pass return result - def download(self, **kwargs): - """Fetch the instance list from getinstances, then download each CNF file via the download manager.""" + def download(self): params = f"query=track%3D{self.track}&context={self.context}" list_url = f"{INSTANCE_LIST_URL}?{params}" - print(f"Fetching SAT instance list from {list_url}") + req = Request(list_url) with urlopen(req) as response: body = response.read().decode("utf-8") - # One file URL per line (e.g. http://benchmark-database.de/file/00d5a43a...) file_urls = [line.strip() for line in body.splitlines() if line.strip()] - if not file_urls: - raise ValueError( - f"No instances returned from {list_url}. " - "Check track and context parameters." 
- ) + raise ValueError(f"No instances returned from {list_url}. Check track/context.") - # Use last path segment (hash) as filename; store as .cnf.xz (server sends XZ-compressed) def path_to_name(url: str) -> str: name = url.rstrip("/").split("/")[-1] if name.lower().endswith(".cnf.xz"): @@ -166,34 +135,20 @@ def path_to_name(url: str) -> str: self.dataset_dir.mkdir(parents=True, exist_ok=True) - # Deduplicate by destination (instance list may contain duplicate URLs) - seen_dest = set() + seen_targets = set() items = [] for url in file_urls: - dest = self.dataset_dir / path_to_name(url) - if dest not in seen_dest: - seen_dest.add(dest) - items.append((url, dest)) - - workers = kwargs.get("workers", 1) - print(f"Downloading {len(items)} SAT instances to {self.dataset_dir} (workers={workers})") - download_manager( - items, - desc_prefix="Instance", - skip_existing=True, - **kwargs, - ) + target = path_to_name(url) + if target not in seen_targets: + seen_targets.add(target) + items.append((url, target)) - files = self._list_instances() - if not files: - raise ValueError( - f"Download completed but no .cnf.xz files found in {self.dataset_dir}" - ) - self._collect_all_metadata() - print(f"Finished downloading {len(files)} instances") + print(f"Downloading {len(items)} SAT instances to {self.dataset_dir}") + for url, target in items: + destination = str(self.dataset_dir / target) + self._download_file(url=url, target="", destination=destination, desc=target) if __name__ == "__main__": - dataset = SATDataset(track="main_2025", context="cnf", download=True) + dataset = SATDataset(track="main_2025", context="cnf", download=False) print("Dataset size:", len(dataset)) - print("Instance 0:", dataset[0]) diff --git a/cpmpy/tools/datasets/xcsp3.py b/cpmpy/tools/datasets/xcsp3.py index d7aaef91c..5d9a05ce0 100644 --- a/cpmpy/tools/datasets/xcsp3.py +++ b/cpmpy/tools/datasets/xcsp3.py @@ -11,7 +11,7 @@ import io import cpmpy as cp -from cpmpy.tools.datasets._base import 
FileDataset +from cpmpy.tools.datasets.core import FileDataset from cpmpy.tools.datasets.metadata import FeaturesInfo @@ -47,7 +47,7 @@ class XCSP3Dataset(FileDataset): # torch.utils.data.Dataset compatible }) - def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False, metadata_workers: int = 1): + def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transform=None, target_transform=None, download: bool = False, **kwargs): """ Initialize the XCSP3 Dataset. """ @@ -67,7 +67,7 @@ def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transf dataset_dir=dataset_dir, transform=transform, target_transform=target_transform, download=download, extension=".xml.lzma", - metadata_workers=metadata_workers + **kwargs ) @@ -87,6 +87,9 @@ def category(self) -> dict: "track": self.track } + def categories(self) -> dict: + return self.category() + def collect_instance_metadata(self, file) -> dict: """Extract instance type (CSP/COP) from XCSP3 XML root element.""" import re @@ -121,7 +124,7 @@ def download(self): print(f"Downloading XCSP3 {self.year} instances from www.cril.univ-artois.fr") try: - target_download_path = self._download_file(url, target, destination=str(target_download_path), origins=self.origins) + target_download_path = self._download_file(url, target, destination=str(target_download_path)) except ValueError as e: raise ValueError(f"No dataset available for year {self.year}. 
Error: {str(e)}") From 766112a64b5d7bbad2877e09e89e42ce9a44b793 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 13:50:03 +0100 Subject: [PATCH 137/152] opb writer fix transformations --- cpmpy/tools/io/opb.py | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/cpmpy/tools/io/opb.py b/cpmpy/tools/io/opb.py index c0e34c152..a7e790d31 100644 --- a/cpmpy/tools/io/opb.py +++ b/cpmpy/tools/io/opb.py @@ -25,7 +25,7 @@ import sys import argparse from io import StringIO -from typing import Union +from typing import Union, Optional, Callable from functools import reduce from operator import mul @@ -33,10 +33,14 @@ import cpmpy as cp from cpmpy.transformations.normalize import toplevel_list,simplify_boolean from cpmpy.transformations.safening import no_partial_functions, safen_objective -from cpmpy.transformations.decompose_global import decompose_in_tree, decompose_objective from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective from cpmpy.transformations.reification import only_implies, only_bv_reifies -from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv_wsum +from cpmpy.transformations.linearize import ( + decompose_linear, + decompose_linear_objective, + linearize_constraint, + only_positive_bv_wsum, +) from cpmpy.transformations.int2bool import int2bool, _encode_int_var, _decide_encoding from cpmpy.transformations.get_variables import get_variables from cpmpy.expressions.variables import _IntVarImpl, NegBoolView, _BoolVarImpl @@ -201,7 +205,7 @@ def load_opb(opb: Union[str, os.PathLike], open=open) -> cp.Model: return model -def write_opb(model, fname=None, encoding="auto", header=None): +def write_opb(model, fname=None, encoding="auto", header=None, open=None): """ Export a CPMpy model to the OPB (Pseudo-Boolean) format. 
@@ -216,6 +220,10 @@ def write_opb(model, fname=None, encoding="auto", header=None): encoding (str, optional): The encoding used for `int2bool`. Options: ("auto", "direct", "order", "binary"). header (str, optional): Optional header text to add as OPB comments. If provided, each line will be prefixed with "* ". + open (callable, optional): Callable to open the file for writing (default: builtin ``open``). + Called as ``open(fname, "w")``. This mirrors the ``open=`` argument + in loaders and allows custom compression or I/O (e.g. + ``lambda p, mode='w': lzma.open(p, 'wt')``). Returns: str or None: The OPB string if `fname` is None, otherwise nothing (writes to file). @@ -277,9 +285,9 @@ def write_opb(model, fname=None, encoding="auto", header=None): contents = "\n".join(out) if fname is None: return contents - else: - with open(fname, "w") as f: - f.write(contents) + opener = open if open is not None else _std_open + with opener(fname, "w") as f: + f.write(contents) def _normalized_comparison(lst_of_expr): """ @@ -373,9 +381,13 @@ def _transform(cpm_expr, csemap, ivarmap, encoding="auto"): cpm_cons = toplevel_list(cpm_expr) cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"div", "mod", "element"}) - cpm_cons = decompose_in_tree(cpm_cons, - supported={"alldifferent"}, # alldiff has a specialized MIP decomp in linearize - csemap=csemap + # Use linear-specific decompositions (e.g. AllDifferent.decompose_linear) + # before linearization, consistent with MIP backends. 
+ cpm_cons = decompose_linear( + cpm_cons, + supported=frozenset(), + supported_reified=frozenset(), + csemap=csemap, ) cpm_cons = simplify_boolean(cpm_cons) cpm_cons = flatten_constraint(cpm_cons, csemap=csemap) # flat normal form @@ -395,8 +407,12 @@ def _transform_objective(expr, csemap, ivarmap, encoding="auto"): # transform objective obj, safe_cons = safen_objective(expr) - obj, decomp_cons = decompose_objective(obj, supported={"alldifferent"}, - csemap=csemap) + obj, decomp_cons = decompose_linear_objective( + obj, + supported=frozenset(), + supported_reified=frozenset(), + csemap=csemap, + ) obj, flat_cons = flatten_objective(obj, csemap=csemap) obj = only_positive_bv_wsum(obj) # remove negboolviews From 9d7dadeae6245b8d7f70b8d6bd8e15648ad4e022 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 13:50:50 +0100 Subject: [PATCH 138/152] scip small changes --- cpmpy/tools/io/scip.py | 51 +++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/cpmpy/tools/io/scip.py b/cpmpy/tools/io/scip.py index ccb22ab37..a245669cb 100644 --- a/cpmpy/tools/io/scip.py +++ b/cpmpy/tools/io/scip.py @@ -44,10 +44,10 @@ from cpmpy.expressions.core import BoolVal, Comparison, Operator from cpmpy.expressions.variables import _NumVarImpl, _BoolVarImpl, NegBoolView, _IntVarImpl from cpmpy.transformations.comparison import only_numexpr_equality -from cpmpy.transformations.decompose_global import decompose_in_tree, decompose_objective +from cpmpy.transformations.decompose_global import decompose_objective from cpmpy.transformations.flatten_model import flatten_constraint, flatten_objective from cpmpy.transformations.get_variables import get_variables -from cpmpy.transformations.linearize import linearize_constraint, only_positive_bv, only_positive_bv_wsum +from cpmpy.transformations.linearize import decompose_linear, linearize_constraint, only_positive_bv, only_positive_bv_wsum from cpmpy.transformations.normalize import 
toplevel_list from cpmpy.transformations.reification import only_bv_reifies, only_implies, reify_rewrite from cpmpy.expressions.utils import is_any_list, is_num @@ -302,13 +302,16 @@ def _make_numexpr(self, cpm_expr): return self.solver_var(cpm_expr) # sum - if cpm_expr.name == "sum": + if hasattr(cpm_expr, "name") and cpm_expr.name == "sum": return scip.quicksum(self.solver_vars(cpm_expr.args)) - if cpm_expr.name == "sub": + if hasattr(cpm_expr, "name") and cpm_expr.name == "sub": a,b = self.solver_vars(cpm_expr.args) return a - b + if hasattr(cpm_expr, "name") and cpm_expr.name == "abs": + (a,) = self.solver_vars(cpm_expr.args) + return abs(a) # wsum - if cpm_expr.name == "wsum": + if hasattr(cpm_expr, "name") and cpm_expr.name == "wsum": return scip.quicksum(w * self.solver_var(var) for w, var in zip(*cpm_expr.args)) raise NotImplementedError("scip: Not a known supported numexpr {}".format(cpm_expr)) @@ -335,10 +338,15 @@ def transform(self, cpm_expr): cpm_cons = toplevel_list(cpm_expr) cpm_cons = no_partial_functions(cpm_cons, safen_toplevel={"mod", "div", "element"}) - cpm_cons = decompose_in_tree(cpm_cons, - supported=self.supported_global_constraints | {"alldifferent"}, - supported_reified=self.supported_reified_global_constraints, - csemap=self._csemap) + # Use the same linear-first decomposition as MIP solver backends. + # This ensures globals such as alldifferent are decomposed with their + # specialized linear decompositions before linearize_constraint(). 
+ cpm_cons = decompose_linear( + cpm_cons, + supported=self.supported_global_constraints, + supported_reified=self.supported_reified_global_constraints, + csemap=self._csemap, + ) cpm_cons = flatten_constraint(cpm_cons, csemap=self._csemap) cpm_cons = reify_rewrite(cpm_cons, supported=frozenset(['sum', 'wsum', 'sub']), csemap=self._csemap) cpm_cons = only_numexpr_equality(cpm_cons, supported=frozenset(["sum", "wsum", "sub"]) | self.supported_global_constraints, csemap=self._csemap) @@ -400,8 +408,9 @@ def add(self, cpm_expr_orig): sciplhs = self._make_numexpr(lhs) self.scip_model.addCons(sciplhs >= sciprhs, name=self._get_constraint_name()) elif cpm_expr.name == '==': + lhs_name = getattr(lhs, "name", None) if isinstance(lhs, _NumVarImpl) \ - or (isinstance(lhs, Operator) and (lhs.name == 'sum' or lhs.name == 'wsum' or lhs.name == "sub")): + or (lhs_name in {"sum", "wsum", "sub", "abs"}): # a BoundedLinearExpression LHS, special case, like in objective sciplhs = self._make_numexpr(lhs) self.scip_model.addCons(sciplhs == sciprhs, name=self._get_constraint_name()) @@ -480,18 +489,25 @@ def add(self, cpm_expr_orig): __add__ = add -def _to_writer(model: cp.Model, problem_name: Optional[str] = None) -> _SCIPWriter: +def _to_writer(model: cp.Model, problem_name: Optional[str] = None, require_objective: bool = False) -> _SCIPWriter: """ Convert a CPMpy model to a SCIP writer + + Arguments: + model: CPMpy model + problem_name: Optional name for the problem + require_objective: If True, raise an error if model has no objective. + If False, allow satisfaction problems (no objective). 
""" writer = _SCIPWriter(problem_name=problem_name) # 1) post constraints for constraint in model.constraints: writer += constraint - # 2) post objective - if not model.has_objective(): + # 2) post objective (if present) + if model.has_objective(): + writer.objective(model.objective_, model.objective_is_min) + elif require_objective: raise ValueError("Model has no objective function") - writer.objective(model.objective_, model.objective_is_min) return writer @@ -557,7 +573,7 @@ def write_scip(model: cp.Model, fname: Optional[str] = None, format: str = "mps" - "mps" - "lp" - "cip" - - "fzn" + - "fzn" (supports both satisfaction and optimization problems) - "gms" - "pip" @@ -565,7 +581,10 @@ def write_scip(model: cp.Model, fname: Optional[str] = None, format: str = "mps" For more information, see the SCIP documentation: https://pyscipopt.readthedocs.io/en/latest/tutorials/readwrite.html """ - writer = _to_writer(model, problem_name="CPMpy Model") + # FZN format supports satisfaction problems (no objective), others may require it + #require_obj = format != "fzn" + require_obj = False + writer = _to_writer(model, problem_name="CPMpy Model", require_objective=require_obj) # Decide where to write if fname is None: From e1273116603fd34eb81ea7226a383d734e6c2d1b Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 13:50:59 +0100 Subject: [PATCH 139/152] path rename --- docs/api/tools.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/tools.rst b/docs/api/tools.rst index 55d1706f3..06eb5c7f5 100644 --- a/docs/api/tools.rst +++ b/docs/api/tools.rst @@ -9,7 +9,7 @@ Tools (:mod:`cpmpy.tools`) :maxdepth: 1 :caption: Tools: - tools/dataset + tools/datasets tools/readers tools/writers tools/benchmarks From 81b88a3631b34a338fb6fb232d052d8ccae58f69 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 14:06:16 +0100 Subject: [PATCH 140/152] Move dimacs --- cpmpy/tools/datasets/sat.py | 2 +- cpmpy/tools/datasets/transforms.py | 4 +- 
cpmpy/tools/dimacs.py | 139 ++-------------------------- cpmpy/tools/io/dimacs.py | 142 +++++++++++++++++++++++++++++ cpmpy/tools/io/reader.py | 2 +- cpmpy/tools/io/writer.py | 2 +- 6 files changed, 154 insertions(+), 137 deletions(-) create mode 100644 cpmpy/tools/io/dimacs.py diff --git a/cpmpy/tools/datasets/sat.py b/cpmpy/tools/datasets/sat.py index 991cf05af..62534159e 100644 --- a/cpmpy/tools/datasets/sat.py +++ b/cpmpy/tools/datasets/sat.py @@ -77,7 +77,7 @@ def __init__( @staticmethod def _loader(content: str): - from cpmpy.tools.dimacs import load_dimacs + from cpmpy.tools.io.dimacs import load_dimacs with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".cnf") as tmp: tmp.write(content) tmp_path = tmp.name diff --git a/cpmpy/tools/datasets/transforms.py b/cpmpy/tools/datasets/transforms.py index 2fab520be..fe112ceb4 100644 --- a/cpmpy/tools/datasets/transforms.py +++ b/cpmpy/tools/datasets/transforms.py @@ -336,7 +336,7 @@ class Serialize: ... ]) >>> # Using writer function directly - >>> from cpmpy.tools.dimacs import write_dimacs + >>> from cpmpy.tools.io.dimacs import write_dimacs >>> transform = Compose([ ... Load(load_wcnf, open=dataset.open), ... Serialize(write_dimacs), @@ -394,7 +394,7 @@ class Translate: >>> dimacs_string, metadata = dataset[0] >>> # Using writer function directly - >>> from cpmpy.tools.dimacs import write_dimacs + >>> from cpmpy.tools.io.dimacs import write_dimacs >>> transform = Translate(dataset.loader, write_dimacs, open=dataset.open) >>> dataset = MSEDataset(transform=transform) >>> dimacs_string, metadata = dataset[0] diff --git a/cpmpy/tools/dimacs.py b/cpmpy/tools/dimacs.py index 534c5d134..1ddd4fbdb 100644 --- a/cpmpy/tools/dimacs.py +++ b/cpmpy/tools/dimacs.py @@ -1,141 +1,16 @@ #!/usr/bin/env python #-*- coding:utf-8 -*- ## -## dimacs.py +## dimacs.py (re-export from cpmpy.tools.io.dimacs) ## """ - This file implements helper functions for exporting CPMpy models from and to DIMACS format. 
- DIMACS is a textual format to represent CNF problems. - The header of the file should be formatted as ``p cnf ``. - If the number of variables and constraints are not given, it is inferred by the parser. + DIMACS read/write support. - Each remaining line of the file is formatted as a list of integers. - An integer represents a Boolean variable and a negative Boolean variable is represented using a `'-'` sign. + This module re-exports from :mod:`cpmpy.tools.io.dimacs` for backward + compatibility. New code should import from ``cpmpy.tools.io`` or + ``cpmpy.tools.io.dimacs``. """ -import cpmpy as cp - -from cpmpy.expressions.variables import _BoolVarImpl, NegBoolView -from cpmpy.expressions.core import Operator, Comparison - -from cpmpy.transformations.normalize import toplevel_list -from cpmpy.transformations.to_cnf import to_cnf -from cpmpy.transformations.get_variables import get_variables - -import re -from typing import Optional - -def write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMACS file written by CPMpy"): - """ - Writes CPMpy model to DIMACS format - Uses the "to_cnf" transformation from CPMpy - - .. 
todo:: - TODO: implement pseudoboolean constraints in to_cnf - - :param model: a CPMpy model - :param fname: optional, file name to write the DIMACS output to - :param encoding: the encoding used for `int2bool`, choose from ("auto", "direct", "order", or "binary") - """ - - if model.has_objective(): - raise ValueError("DIMACS format does not support objective functions") - - constraints = toplevel_list(model.constraints) - constraints = to_cnf(constraints, encoding=encoding) - - vars = get_variables(constraints) - mapping = {v : i+1 for i, v in enumerate(vars)} - - out = f"p cnf {len(vars)} {len(constraints)}\n" - for cons in constraints: - - if isinstance(cons, _BoolVarImpl): - cons = Operator("or", [cons]) - - if not (isinstance(cons, Operator) and cons.name == "or"): - raise NotImplementedError(f"Unsupported constraint {cons}") - - # write clause to cnf format - ints = [] - for v in cons.args: - if isinstance(v, NegBoolView): - ints.append(str(-mapping[v._bv])) - elif isinstance(v, _BoolVarImpl): - ints.append(str(mapping[v])) - else: - raise ValueError(f"Expected Boolean variable in clause, but got {v} which is of type {type(v)}") - - out += " ".join(ints + ["0"]) + "\n" - - if header is not None: - header_lines = ["c " + line for line in header.splitlines()] - out = "\n".join(header_lines) + "\n" + out - - if fname is not None: - with open(fname, "w") as f: - f.write(out) - - return out - - -def load_dimacs(fname): - """ - Load a CPMpy model from a DIMACS formatted file strictly following the specification: - https://web.archive.org/web/20190325181937/https://www.satcompetition.org/2009/format-benchmarks2009.html - - .. 
note:: - The p-line has to denote the correct number of variables and clauses - - :param fname: the name of the DIMACS file - :param sep: optional, separator used in the DIMACS file, will try to infer if None - """ - - m = cp.Model() - - with open(fname, "r") as f: - clause = [] - nr_vars = None - for line in f.readlines(): - if line == "" or line.startswith("c"): - continue # skip empty and comment lines - elif line.startswith("p"): - params = line.strip().split(" ") - assert len(params) == 4, f"Expected p-header to be formed `p cnf nr_vars nr_cls` but got {line}" - _,typ,nr_vars,nr_cls = params - if typ != "cnf": - raise ValueError("Expected `cnf` (i.e. DIMACS) as file format, but got {typ} which is not supported.") - nr_vars = int(nr_vars) - if nr_vars>0: - bvs = cp.boolvar(shape=nr_vars) - nr_cls = int(nr_cls) - else: - assert nr_vars is not None, "Expected p-line before first clause" - for token in line.strip().split(): - i = int(token.strip()) - if i == 0: - m += cp.any(clause) - clause = [] - else: - var=abs(i)-1 - assert var < nr_vars, "Expected at most {nr_vars} variables (from p-line) but found literal {i} in clause {line}" - bv = bvs[var] - - clause.append(bv if i > 0 else ~bv) - - assert nr_vars is not None, "Expected file to contain p-line, but did not" - assert len(clause) == 0, f"Expected last clause to be terminated by 0, but it was not" - assert len(m.constraints) == nr_cls, f"Number of clauses was declared in p-line as {nr_cls}, but was {len(m.constraints)}" - - return m - -# Backward compatibility alias -read_dimacs = load_dimacs - -# Backward compatibility alias -read_dimacs = load_dimacs - - - - +from cpmpy.tools.io.dimacs import load_dimacs, read_dimacs, write_dimacs +__all__ = ["load_dimacs", "read_dimacs", "write_dimacs"] diff --git a/cpmpy/tools/io/dimacs.py b/cpmpy/tools/io/dimacs.py new file mode 100644 index 000000000..3f16776f3 --- /dev/null +++ b/cpmpy/tools/io/dimacs.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +#-*- coding:utf-8 -*- 
+## +## dimacs.py +## +""" + This file implements helper functions for exporting CPMpy models from and to DIMACS format. + DIMACS is a textual format to represent CNF problems. + The header of the file should be formatted as ``p cnf ``. + If the number of variables and constraints are not given, it is inferred by the parser. + + Each remaining line of the file is formatted as a list of integers. + An integer represents a Boolean variable and a negative Boolean variable is represented using a `'-'` sign. +""" + +import cpmpy as cp + +from cpmpy.expressions.variables import _BoolVarImpl, NegBoolView +from cpmpy.expressions.core import Operator, Comparison + +from cpmpy.transformations.normalize import toplevel_list +from cpmpy.transformations.to_cnf import to_cnf +from cpmpy.transformations.get_variables import get_variables + +import re +from typing import Optional, Callable +import builtins + +def write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMACS file written by CPMpy", open: Optional[Callable]=None): + """ + Writes CPMpy model to DIMACS format + Uses the "to_cnf" transformation from CPMpy + + .. todo:: + TODO: implement pseudoboolean constraints in to_cnf + + :param model: a CPMpy model + :param fname: optional, file name to write the DIMACS output to + :param encoding: the encoding used for `int2bool`, choose from ("auto", "direct", "order", or "binary") + :param open: optional callable to open the file for writing (default: builtin ``open``). + Called as ``open(fname, "w")``. This mirrors the ``open=`` argument + in loaders and allows custom compression or I/O (e.g. + ``lambda p, mode='w': lzma.open(p, 'wt')``). 
+ """ + + if model.has_objective(): + raise ValueError("DIMACS format does not support objective functions") + + constraints = toplevel_list(model.constraints) + constraints = to_cnf(constraints, encoding=encoding) + + vars = get_variables(constraints) + mapping = {v : i+1 for i, v in enumerate(vars)} + + out = f"p cnf {len(vars)} {len(constraints)}\n" + for cons in constraints: + + if isinstance(cons, _BoolVarImpl): + cons = Operator("or", [cons]) + + if not (isinstance(cons, Operator) and cons.name == "or"): + raise NotImplementedError(f"Unsupported constraint {cons}") + + # write clause to cnf format + ints = [] + for v in cons.args: + if isinstance(v, NegBoolView): + ints.append(str(-mapping[v._bv])) + elif isinstance(v, _BoolVarImpl): + ints.append(str(mapping[v])) + else: + raise ValueError(f"Expected Boolean variable in clause, but got {v} which is of type {type(v)}") + + out += " ".join(ints + ["0"]) + "\n" + + if header is not None: + header_lines = ["c " + line for line in header.splitlines()] + out = "\n".join(header_lines) + "\n" + out + + if fname is not None: + opener = open if open is not None else builtins.open + with opener(fname, "w") as f: + f.write(out) + + return out + + +def load_dimacs(fname, open=None): + """ + Load a CPMpy model from a DIMACS formatted file strictly following the specification: + https://web.archive.org/web/20190325181937/https://www.satcompetition.org/2009/format-benchmarks2009.html + + .. note:: + The p-line has to denote the correct number of variables and clauses + + :param fname: the name of the DIMACS file + :param open: optional callable to open the file for reading (default: builtin ``open``). + Use for decompression, e.g. ``lambda p: lzma.open(p, 'rt')`` for ``.cnf.xz``. 
+    """
+    if open is None:
+        open = builtins.open
+
+    m = cp.Model()
+
+    with open(fname, "r") as f:
+        clause = []
+        nr_vars = None
+        for line in f.readlines():
+            if line == "" or line.startswith("c"):
+                continue # skip empty and comment lines
+            elif line.startswith("p"):
+                params = line.strip().split(" ")
+                assert len(params) == 4, f"Expected p-header to be formed `p cnf nr_vars nr_cls` but got {line}"
+                _,typ,nr_vars,nr_cls = params
+                if typ != "cnf":
+                    raise ValueError(f"Expected `cnf` (i.e. DIMACS) as file format, but got {typ} which is not supported.")
+                nr_vars = int(nr_vars)
+                if nr_vars>0:
+                    bvs = cp.boolvar(shape=nr_vars)
+                nr_cls = int(nr_cls)
+            else:
+                assert nr_vars is not None, "Expected p-line before first clause"
+                for token in line.strip().split():
+                    i = int(token.strip())
+                    if i == 0:
+                        m += cp.any(clause)
+                        clause = []
+                    else:
+                        var=abs(i)-1
+                        assert var < nr_vars, f"Expected at most {nr_vars} variables (from p-line) but found literal {i} in clause {line}"
+                        bv = bvs[var]
+
+                        clause.append(bv if i > 0 else ~bv)
+
+    assert nr_vars is not None, "Expected file to contain p-line, but did not"
+    assert len(clause) == 0, f"Expected last clause to be terminated by 0, but it was not"
+    assert len(m.constraints) == nr_cls, f"Number of clauses was declared in p-line as {nr_cls}, but was {len(m.constraints)}"
+
+    return m
+
+# Backward compatibility alias
+read_dimacs = load_dimacs
diff --git a/cpmpy/tools/io/reader.py b/cpmpy/tools/io/reader.py
index d25df69a5..4f60b1928 100644
--- a/cpmpy/tools/io/reader.py
+++ b/cpmpy/tools/io/reader.py
@@ -15,7 +15,7 @@
 from typing import Callable, List, Optional
 
 import cpmpy as cp
-from cpmpy.tools.dimacs import load_dimacs
+from .dimacs import load_dimacs
 from cpmpy.tools.io.scip import load_scip
 from cpmpy.tools.io.wcnf import load_wcnf
 from cpmpy.tools.io.opb import load_opb
diff --git a/cpmpy/tools/io/writer.py b/cpmpy/tools/io/writer.py
index d6e18c441..f6bd4ddd8 100644
--- a/cpmpy/tools/io/writer.py
+++ 
b/cpmpy/tools/io/writer.py @@ -21,7 +21,7 @@ from functools import partial import cpmpy as cp -from cpmpy.tools.dimacs import write_dimacs +from .dimacs import write_dimacs from cpmpy.tools.io.scip import write_scip from cpmpy.tools.io.opb import write_opb from cpmpy.tools.io.utils import get_format From 3bd4db869bd40ef6bcc1d5e8836ed53cedccfc43 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 14:06:53 +0100 Subject: [PATCH 141/152] Fix imports --- cpmpy/tools/datasets/transforms.py | 2 +- cpmpy/tools/io/__init__.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpmpy/tools/datasets/transforms.py b/cpmpy/tools/datasets/transforms.py index fe112ceb4..d2d0b17d0 100644 --- a/cpmpy/tools/datasets/transforms.py +++ b/cpmpy/tools/datasets/transforms.py @@ -515,7 +515,7 @@ def enrich_metadata(self, data, metadata): if not self.write_metadata or self._last_path is None: return metadata - from cpmpy.tools.dataset._base import portable_instance_metadata + from cpmpy.tools.datasets.utils import portable_instance_metadata sidecar = {} diff --git a/cpmpy/tools/io/__init__.py b/cpmpy/tools/io/__init__.py index 6c7af0acb..af46d53c3 100644 --- a/cpmpy/tools/io/__init__.py +++ b/cpmpy/tools/io/__init__.py @@ -19,6 +19,7 @@ # Model datasets from .opb import load_opb, read_opb, write_opb # read_opb is alias +from .dimacs import load_dimacs, read_dimacs, write_dimacs # read_dimacs is alias from .scip import load_scip, read_scip, write_scip # read_scip is alias from .wcnf import load_wcnf, read_wcnf # read_wcnf is alias from .xcsp3 import load_xcsp3 \ No newline at end of file From 94a02112518b291c73027c39135f65a738066826 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 14:07:07 +0100 Subject: [PATCH 142/152] Writer compression option --- cpmpy/tools/io/scip.py | 72 +++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/cpmpy/tools/io/scip.py b/cpmpy/tools/io/scip.py index 
a245669cb..20e3924b7 100644 --- a/cpmpy/tools/io/scip.py +++ b/cpmpy/tools/io/scip.py @@ -39,7 +39,7 @@ import cpmpy as cp import warnings -from typing import Union, Optional +from typing import Union, Optional, Callable from cpmpy.expressions.core import BoolVal, Comparison, Operator from cpmpy.expressions.variables import _NumVarImpl, _BoolVarImpl, NegBoolView, _IntVarImpl @@ -566,7 +566,7 @@ def _add_header(fname: os.PathLike, format: str, header: Optional[str] = None): f.writelines(lines) -def write_scip(model: cp.Model, fname: Optional[str] = None, format: str = "mps", header: Optional[str] = None, verbose: bool = False) -> str: +def write_scip(model: cp.Model, fname: Optional[str] = None, format: str = "mps", header: Optional[str] = None, verbose: bool = False, open: Optional[Callable] = None) -> str: """ Write a CPMpy model to file using a SCIP provided writer. Supported formats include: @@ -579,41 +579,55 @@ def write_scip(model: cp.Model, fname: Optional[str] = None, format: str = "mps" More formats can be supported upon the installation of additional dependencies (like SIMPL). For more information, see the SCIP documentation: https://pyscipopt.readthedocs.io/en/latest/tutorials/readwrite.html + + Arguments: + model: CPMpy model to write. + fname: Path to write to. If None, the file content is returned as a string. + format: Output format (e.g. "mps", "lp", "cip", "fzn", "gms", "pip"). + header: Optional header text to prepend (format-dependent comment style). + verbose: If True, allow SCIP to print progress. + open: Optional callable to open the file for writing (default: builtin ``open``). + Called as ``open(fname, "w")``. Mirrors the ``open=`` argument in loaders and + allows custom compression or I/O (e.g. + ``lambda p, mode='w': lzma.open(p, 'wt')``). + + Returns: + The file content as a string (whether written to ``fname`` or not). 
""" # FZN format supports satisfaction problems (no objective), others may require it #require_obj = format != "fzn" require_obj = False writer = _to_writer(model, problem_name="CPMpy Model", require_objective=require_obj) - - # Decide where to write - if fname is None: - with tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False) as tmp: - fname = tmp.name - try: + + opener = open if open is not None else _std_open + + # Always write via SCIP to a temp file, then add header and get content + with tempfile.NamedTemporaryFile(suffix=f".{format}", delete=False) as tmp: + tmp_fname = tmp.name + try: + if not verbose: writer.scip_model.hideOutput() - # Suppress SCIP's C-level "wrote problem to file" message - devnull = os.open(os.devnull, os.O_WRONLY) - old_stdout = os.dup(1) - os.dup2(devnull, 1) - try: - writer.scip_model.writeProblem(fname) - finally: - os.dup2(old_stdout, 1) - os.close(devnull) - os.close(old_stdout) - _add_header(fname, format, header) - with open(fname, "r") as f: - return f.read() + devnull = os.open(os.devnull, os.O_WRONLY) + old_stdout = os.dup(1) + os.dup2(devnull, 1) + try: + writer.scip_model.writeProblem(tmp_fname, verbose=verbose) finally: - os.remove(fname) - else: - if not verbose: writer.scip_model.hideOutput() - writer.scip_model.writeProblem(fname, verbose=verbose) - if not verbose: writer.scip_model.hideOutput(quiet=False) - _add_header(fname, format, header) - with open(fname, "r") as f: - return f.read() + os.dup2(old_stdout, 1) + os.close(devnull) + os.close(old_stdout) + if not verbose: + writer.scip_model.hideOutput(quiet=False) + _add_header(tmp_fname, format, header) + with _std_open(tmp_fname, "r") as f: + content = f.read() + if fname is not None: + with opener(fname, "w") as f: + f.write(content) + return content + finally: + os.remove(tmp_fname) def main(): parser = argparse.ArgumentParser(description="Parse and solve a SCIP compatible model using CPMpy") From e0191558f3dd09ec3a13e7809bc76cf7f0768930 Mon Sep 17 
00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 17:29:23 +0100 Subject: [PATCH 143/152] Small fixes to metadata --- cpmpy/tools/datasets/core.py | 4 ++++ cpmpy/tools/datasets/metadata.py | 4 ++-- cpmpy/tools/datasets/transforms.py | 8 ++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpmpy/tools/datasets/core.py b/cpmpy/tools/datasets/core.py index f20000a76..b9a7fcc77 100644 --- a/cpmpy/tools/datasets/core.py +++ b/cpmpy/tools/datasets/core.py @@ -111,6 +111,7 @@ from .metadata import FeaturesInfo, DatasetInfo, InstanceInfo from .utils import extract_model_features, portable_instance_metadata +from .transforms import _enrich_from_model # tqdm as an optional dependency, provides prettier progress bars try: @@ -859,6 +860,9 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: # Let transforms contribute to metadata (e.g. model verification info) if hasattr(self.transform, 'enrich_metadata'): metadata = self.transform.enrich_metadata(filename, metadata) + elif isinstance(filename, cp.Model): + # Transform returned a CPMpy model (e.g. dataset.load); enrich from model + metadata = _enrich_from_model(filename, metadata) return filename, metadata diff --git a/cpmpy/tools/datasets/metadata.py b/cpmpy/tools/datasets/metadata.py index b6d0574e4..82f804cbd 100644 --- a/cpmpy/tools/datasets/metadata.py +++ b/cpmpy/tools/datasets/metadata.py @@ -411,7 +411,7 @@ def model_objects(self) -> dict: Currently contains: - - ``decision_variables``: ``{name: CPMpy_variable}`` mapping for every + - ``variables``: ``{name: CPMpy_variable}`` mapping for every decision variable in the loaded model. 
These objects are **not JSON-serialisable** and are excluded from @@ -423,7 +423,7 @@ def model_objects(self) -> dict: dataset.transform = Load(dataset.loader, open=dataset.open) model, info = dataset[0] - vars = info.model_objects["decision_variables"] + vars = info.model_objects["variables"] model.solve() print({name: v.value() for name, v in vars.items()}) """ diff --git a/cpmpy/tools/datasets/transforms.py b/cpmpy/tools/datasets/transforms.py index d2d0b17d0..0dd253f77 100644 --- a/cpmpy/tools/datasets/transforms.py +++ b/cpmpy/tools/datasets/transforms.py @@ -142,7 +142,7 @@ def metadata_from_model(model): This is called by transforms that produce CPMpy models (Load, Translate) via their ``enrich_metadata`` method. It adds: - - ``decision_variables``: list of dicts with name, type, lb, ub for each variable + - ``variables``: ``{name: CPMpy_variable}`` mapping for every decision variable - ``objective``: string representation of the objective expression (if any) - ``objective_is_min``: True if minimizing, False if maximizing (if any) """ @@ -156,7 +156,7 @@ def metadata_from_model(model): from cpmpy.expressions.variables import _BoolVarImpl variables = get_variables_model(model) - metadata['decision_variables'] = { + metadata['variables'] = { v.name: v for v in variables } @@ -280,7 +280,7 @@ class Load: >>> from cpmpy.tools.io.wcnf import load_wcnf >>> dataset = MSEDataset(transform=Load(load_wcnf, open=dataset.open)) >>> model, metadata = dataset[0] - >>> metadata['decision_variables'] # list of variable descriptors + >>> metadata['variables'] # list of variable descriptors >>> metadata['objective'] # objective expression string (if any) """ @@ -398,7 +398,7 @@ class Translate: >>> transform = Translate(dataset.loader, write_dimacs, open=dataset.open) >>> dataset = MSEDataset(transform=transform) >>> dimacs_string, metadata = dataset[0] - >>> metadata['decision_variables'] # from the intermediate model + >>> metadata['variables'] # from the intermediate model 
""" def __init__(self, loader, writer, open=None, **kwargs): From aa1303a8945e558433e9cfa45926df781693cf29 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 6 Mar 2026 17:29:58 +0100 Subject: [PATCH 144/152] Start of docs --- docs/instance_metadata.md | 695 ++++++++++++++++++++++++++ docs/reading_and_writing.md | 968 ++++++++++++++++++++++++++++++++++++ 2 files changed, 1663 insertions(+) create mode 100644 docs/instance_metadata.md create mode 100644 docs/reading_and_writing.md diff --git a/docs/instance_metadata.md b/docs/instance_metadata.md new file mode 100644 index 000000000..bbbb6a145 --- /dev/null +++ b/docs/instance_metadata.md @@ -0,0 +1,695 @@ +# Instance Metadata + +When running experiments on benchmark sets, the raw instances are rarely enough. +You need to know *what* you are solving: how many variables, what domain sizes, +whether an optimal solution is known, what the structure looks like. You also +want to carry that information through transformations — if you translate an +instance from XCSP3 to OPB, the number of jobs in the original scheduling +problem should still be attached to the result. And when you finally save the +results of a batch run to a CSV or a database, you want a clean, predictable +record structure. + +CPMpy's metadata system is built around a single class — `InstanceInfo` — that +addresses all of these concerns at once. This page explains the system from +first principles, starting with the simplest usage (it's just a dict) and +gradually introducing more powerful features. + +--- + +## Level 1 — It's just a dict + +`InstanceInfo` inherits directly from Python's `dict`. 
+ +```python +from cpmpy.tools.datasets import JSPLibDataset + +dataset = JSPLibDataset(root="./data", download=True) +file_path, info = dataset[0] + +# Standard dict access — always works, nothing to learn +info["name"] # "abz5" +info["jobs"] # 10 +info.get("optimum") # 1234 (or None if not recorded for this instance) +"machines" in info # True +list(info.keys()) # ["dataset", "name", "category", "jobs", "machines", …] + +for key, value in info.items(): + print(key, "=", value) +``` + +If you only need a handful of fields and don't care about structure, stop here. +The rest of this page describes what the metadata system adds on top. + +--- + +## Level 2 — Structured properties + +A benchmark instance metadata dict contains many different kinds of information +mixed together: system bookkeeping fields like `"name"` and `"path"`, problem +parameters like `"jobs"` and `"machines"`, format-specific header statistics +like `"opb_num_variables"`, and (after loading) CP model statistics like +`"num_constraints"`. All of these coexist in the flat dict. + +`InstanceInfo` adds four read-only *properties* that partition that flat dict +into named groups. 
The data is not duplicated — the properties are computed +views over the same underlying dict: + +```python +file_path, info = dataset[0] + +info.domain_metadata # {"jobs": 10, "machines": 10, "optimum": 1234, …} +info.format_metadata # {"opb_num_variables": 42, "opb_num_constraints": 30} +info.model_features # {"num_variables": 100, "num_constraints": 47, …} +info.model_objects # {"variables": {"start_0_0": IntVar(…), …}} +``` + +The four partitions and what belongs in each: + +| Property | Contents | Serializable | +|----------|----------|:---:| +| `domain_metadata` | Problem-level, format-independent fields: `jobs`, `machines`, `horizon`, `num_staff`, `optimum`, … | ✅ | +| `format_metadata` | Format-specific header fields, prefixed by format: `opb_*`, `wcnf_*`, `mps_*`, `xcsp_*`, `dimacs_*` | ✅ | +| `model_features` | CP model statistics computed from the parsed model: variable counts, constraint counts, domain sizes, objective info | ✅ | +| `model_objects` | Live Python objects added by `Load`: the `variables` name→variable map — **in-memory only** | ❌ | + +The distinction matters in practice. Consider translating an OPB instance to +SAT (DIMACS format). The `domain_metadata` fields — problem-level parameters +such as the number of variables and constraints — describe the *problem* and +remain valid regardless of format. The `format_metadata` fields — +`opb_num_variables`, `opb_num_constraints` — describe the *file* and become +meaningless once the file is gone (replaced by `dimacs_*` for the new format). +The `model_features` describe the CPMpy model that was parsed from the file, +and may differ from the format statistics if, say, some variables were +simplified away during transformation. 
+ +### The stable instance ID + +In addition to the four partitions, `InstanceInfo` provides a stable +slash-separated identifier built from the dataset name, any category labels +(year, track, variant), and the instance name: + +```python +info.id # "jsplib/abz5" + # "xcsp3/2024/CSP/AverageAvoiding-20_c24" + # "opb/miplib/aflow30b" +``` + +This `id` is designed to be unique and human-readable, making it +suitable as a primary key when storing results in a database, CSV, or +experiment log. It is included automatically in `to_croissant()` and `to_gbd()` +output. In the future, CPMpy may support **globally unique instance ID hashes** +as provided by the [Global Benchmark Database (GBD)](https://benchmark-database.de/); such hashes identify the same instance across collections and formats. For more on GBD’s instance identification and feature records, see the [GBD project](https://benchmark-database.de/) and the [SAT 2024 paper](https://doi.org/10.4230/LIPIcs.SAT.2024.18). + +--- + +## Level 3 — Adding your own fields + +The most common metadata operation is simply adding computed fields. You have +done your own analysis on an instance and want to attach the result alongside +the existing metadata so that everything travels together. + +### The `|` operator + +`InstanceInfo` overrides Python's dict merge operator `|` so that the result +is always a new `InstanceInfo`, not a plain dict. This means all structured +properties remain available after the merge: + +```python +for file_path, info in dataset: + enriched = info | { + "density": info["jobs"] / info["machines"], + "has_optimum": info.get("optimum") is not None, + } + + # The new fields are just dict keys: + print(enriched["density"]) + + # But structured properties still work on the merged result: + print(enriched.domain_metadata) # includes "density" and "has_optimum" + print(enriched.id) # unchanged +``` + +The original `info` is not modified; `|` always creates a new object. 
+ +### Via `target_transform` + +If you want the enrichment to happen automatically on every single item — +without writing a loop — pass a `target_transform` to the dataset constructor. +It is called with each `InstanceInfo` after the main `transform` has run, and +its return value replaces the info for that iteration: + +```python +def add_difficulty(info): + jobs = info.get("jobs", 1) + machines = info.get("machines", 1) + return info | { + "density": jobs / machines, + "has_optimum": info.get("optimum") is not None, + } + +dataset = JSPLibDataset(root="./data", target_transform=add_difficulty) + +# Now every item in the loop already has the extra fields: +for file_path, info in dataset: + print(info["density"]) + print(info["has_optimum"]) +``` + +A `lambda` works equally well for simple one-liners: + +```python +dataset = JSPLibDataset( + root="./data", + target_transform=lambda info: info | {"density": info["jobs"] / info["machines"]}, +) +``` + +`target_transform` is the right place for lightweight, stateless computations +that depend only on what is already in the metadata — computing derived ratios, +renaming fields, filtering, or converting types. For computations that depend +on the actual file content or a loaded model, use a full `transform` or +`enrich_metadata` (see Level 6). + +--- + +## Level 4 — Handling format changes + +Suppose you have a dataset of OPB instances and you want to translate all of +them to SAT (DIMACS format). After the translation, the `format_metadata` fields that describe +the OPB file (`opb_num_variables`, `opb_num_constraints`, …) no longer +describe the file you are working with. Leaving them in the metadata is +misleading. At the same time, the `domain_metadata` fields — the problem-level +parameters that were true of the original instance — are still valid and should +be kept. 
+ +`InstanceInfo.without_format()` solves this cleanly: it returns a copy of the +metadata with all format-specific fields stripped, while preserving everything +else: + +```python +from cpmpy.tools.datasets.transforms import Translate, extract_format_metadata + +dataset = OPBDataset(root="./data", download=True) +dataset.transform = Translate(dataset.load, "dimacs", open=dataset.open) + +for dimacs_string, info in dataset: + # At this point info still has the old opb_* fields from the original file. + # Strip them and add the new dimacs_* fields extracted from the translated string: + new_info = info.without_format() | extract_format_metadata(dimacs_string, "dimacs") + + # Domain fields are carried forward untouched: + print(new_info["name"]) # ✅ still there + + # Old format fields are gone: + assert "opb_num_variables" not in new_info + + # New format fields are present: + print(new_info["dimacs_num_variables"]) # ✅ from the translated DIMACS string +``` + +`extract_format_metadata` parses the header of the output file to extract +format-specific statistics like variable and constraint counts. It currently +supports `"opb"`, `"dimacs"`, `"mps"`, and `"lp"`. Other formats (e.g. XCSP3, +WCNF) are not supported there because they do not have a simple line-based +header that can be parsed from a raw string — XCSP3 is XML, and WCNF shares +the DIMACS `p` line but is usually handled by the loader. + +**Alternatives for formats without header-based extraction:** + +- **`collect_instance_metadata(file)`** — In your dataset class, open the file, + parse as much as you need (e.g. the first few lines or the XML root), and + return a dict with format-prefixed keys (e.g. `xcsp_format`, `instance_type`). + The framework stores these in `format_metadata`. See e.g. `XCSP3Dataset.collect_instance_metadata` + in the codebase, which reads the XCSP3 XML header to set `xcsp_format` and + `instance_type`. 
+ +- **`Load` + `model_features`** — After a `Load` transform, the framework + fills `model_features` from the parsed CPMpy model (`num_variables`, + `num_constraints`, etc.). That gives you portable instance statistics + regardless of file format, without implementing format-specific header parsing. + +If you only want to strip the old format fields without adding new ones — for +example, because you are translating to a format whose header has no useful +statistics — just call `without_format()` on its own: + +```python +stripped = info.without_format() +assert not stripped.format_metadata # empty +assert stripped["jobs"] == info["jobs"] # domain fields intact +``` + +The chain pattern — `without_format() | {...}` — is intentionally modelled +after Python's own dict merge so that it feels natural and composes well with +any additional fields you want to attach at the same time. + +--- + +## Level 5 — Model objects and printing solution values + +When you set the dataset's transform to the dataset's loader (e.g. +`dataset.transform = dataset.load`), each instance is loaded into a CPMpy +model and that model is the iteration value. But where do the *variable names* +go? The model holds CPMpy variable objects, but the connection between a +variable's string name and its object is not always easy to recover after the +fact. + +The framework fills `model_objects["variables"]` with a `{name: CPMpy_variable}` +mapping whenever the transform returns a CPMpy model. 
After solving, you can +read every variable's value by name, without holding any separate reference to +the variable objects: + +```python +from cpmpy.tools.datasets import JSPLibDataset + +dataset = JSPLibDataset(root="./data", download=True) +dataset.transform = dataset.load + +for model, info in dataset: + print(f"Solving {info['name']} ({info['jobs']}×{info['machines']})…") + if model.solve(): + print(f" Optimal makespan: {model.objective_value()}") + dvars = info.model_objects["variables"] + # Print start times for all tasks + for name, var in dvars.items(): + if name.startswith("start_"): + print(f" {name} = {var.value()}") + else: + print(" No solution found") +``` + +--- + +## Level 6 — Custom transforms with `enrich_metadata` + +So far, metadata enrichment has been done either after the fact (with `|` in +the loop) or via `target_transform` (which sees the final metadata but not the +transformed data). Sometimes you need to enrich metadata based on the *output* +of a transform — for example, computing file size after compression, or +extracting format statistics from a translated string. + +Any transform class can implement an optional `enrich_metadata(self, data, +metadata)` method. The dataset calls it automatically after `__call__` returns, +passing both the output of `__call__` and the current `InstanceInfo`. Whatever +`enrich_metadata` returns becomes the new metadata for that item. 
+ +### Example: annotating with file size + +```python +import os + +class AnnotateFileSize: + """Passes the file path through unchanged, but records the file size.""" + + def __call__(self, file_path): + return file_path # data is unchanged + + def enrich_metadata(self, data, metadata): + # data is the file path (the return value of __call__) + return metadata | {"file_size_bytes": os.path.getsize(data)} + + +dataset = JSPLibDataset(root="./data") +dataset.transform = AnnotateFileSize() + +for file_path, info in dataset: + print(f"{info['name']}: {info['file_size_bytes']} bytes") +``` + +This is useful in benchmark studies where instance size is a predictor of +solver difficulty, or when you want to log instance sizes alongside solve times. + +### Example: format-changing transform + +When `__call__` produces output in a different format, `enrich_metadata` is +the right place to call `without_format()` and attach new format statistics. +This keeps the format bookkeeping inside the transform, so the calling code +stays clean: + +```python +from cpmpy.tools.datasets.transforms import Translate, extract_format_metadata + +class TranslateToOPB: + """Translates any instance to OPB, updating metadata to match.""" + + def __init__(self, loader, open): + self._translate = Translate(loader, "opb", open=open) + + def __call__(self, file_path): + # Perform the actual translation; return the OPB string + return self._translate(file_path) + + def enrich_metadata(self, data, metadata): + # data = OPB string (the return value of __call__) + # metadata = InstanceInfo as it stood before this step + # Strip old format fields; add new ones extracted from the OPB header + return metadata.without_format() | extract_format_metadata(data, "opb") + + +dataset = JSPLibDataset(root="./data") +dataset.transform = TranslateToOPB(dataset.load, open=dataset.open) + +for opb_string, info in dataset: + print(info["jobs"]) # domain field — carried forward automatically + 
print(info["opb_num_variables"]) # new format field — set by enrich_metadata +``` + +### Composing multiple transforms + +`Compose` chains transforms into a pipeline. Each step receives the output of +the previous one, and each step's `enrich_metadata` is called with the output +*that specific step* produced — so the metadata is built up incrementally: + +```python +from cpmpy.tools.datasets.transforms import Compose, Load, Serialize, SaveToFile + +dataset.transform = Compose([ + Load(dataset.load, open=dataset.open), + # ↑ file path → CPMpy model; enrich_metadata adds model_features and + # variables to the metadata + + Serialize("opb"), + # ↑ CPMpy model → OPB string; no enrich_metadata, so metadata unchanged + + SaveToFile("./out/", extension=".opb", write_metadata=True), + # ↑ OPB string → saved file path; writes .meta.json sidecar; + # enrich_metadata adds "output_path" to the metadata +]) + +for output_path, info in dataset: + # At this point info contains everything: domain fields, model features, + # and the output path — all accumulated across the three steps. + print(f"Saved {info['name']} to {output_path}") + print(f" Variables: {info['num_variables']}, Constraints: {info['num_constraints']}") +``` + +The `write_metadata=True` flag in `SaveToFile` causes a `.meta.json` sidecar +file to be written alongside each output file. The sidecar contains everything +from `domain_metadata`, `format_metadata`, and `model_features` — enough to +reconstruct the full context of the instance without re-running the pipeline. + +--- + +## Level 7 — Declaring a metadata schema + +When you publish a dataset, you want to document what metadata fields it +provides, what their types are, and which instances might not have a value for +a given field. CPMpy provides two classes for this: `FieldInfo` describes a +single field; `FeaturesInfo` collects all fields for a dataset. 
+ +### Declaring fields + +```python +from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo + +class MyDataset(FileDataset): + ... + features = FeaturesInfo({ + # Minimal shorthand: just (dtype, description) + "num_jobs": ("int", "Number of jobs in the instance"), + "num_machines": ("int", "Number of machines"), + + # Full FieldInfo when you need nullable or example: + "optimum": FieldInfo( + dtype = "int", + description = "Known optimal makespan, if available", + nullable = True, # some instances may not have a known optimum + example = 1234, + ), + "bounds": FieldInfo( + dtype = "dict", + description = "Lower and upper bounds on the makespan", + nullable = True, + example = {"lower": 800, "upper": 1500}, + ), + }) + + def collect_instance_metadata(self, file) -> dict: + """Called once per instance after download.""" + return { + "num_jobs": ..., + "num_machines": ..., + # "optimum" and "bounds" omitted here if not known — that's fine + } +``` + +The supported dtypes are `"int"`, `"float"`, `"str"`, `"bool"`, `"dict"`, +and `"list"`. Setting `nullable=True` (the default) means the field may be +absent or `None` for some instances — useful for fields like `"optimum"` that +are only known for a subset. + +`FeaturesInfo` also accepts several shorthand forms so that the common cases +are not verbose: + +```python +FeaturesInfo({"jobs": "int"}) # dtype only +FeaturesInfo({"jobs": ("int", "Job count")}) # dtype + description +FeaturesInfo({"jobs": FieldInfo("int", "Job count", nullable=False)}) # full +``` + +### Schema fields are optional + +Providing a full schema is **optional**. You can: + +- **Omit `features` entirely** — the dataset still works. Dataset cards and Croissant export still run but **omit the domain-field schema section** (no table of instance fields, no `cr:field` entries for domain fields in Croissant). 
+- **Use minimal declarations per field** — the framework coerces shorthand to `FieldInfo` with defaults for anything you leave out: + + | You provide | Defaults applied | + |-------------|------------------| + | `"jobs": "int"` (dtype only) | `description=""`, `nullable=True`, `example=None` | + | `"jobs": ("int", "Number of jobs")` | `nullable=True`, `example=None` | + | Full `FieldInfo(dtype, description, nullable=..., example=...)` | No defaults; you control each attribute | + +If you omit or simplify the schema you lose: + +- **No or partial field list** — cards and Croissant won't document your instance fields (or will show them with empty descriptions and "nullable: Yes" for everything). +- **No nullability signal** — consumers and tooling can't tell which fields are guaranteed to be present vs optional. +- **No example values** — documentation and generated cards won't show example values for fields. +- **Weaker typing for tooling** — anything that relies on `info.features` (e.g. validation, codegen, or exports) will have less precise type and description information. + +For a minimal dataset that only defines `name`, `description`, and `homepage`, skipping `features` is fine. For anything you intend to publish or integrate with cards/Croissant, defining at least `(dtype, description)` per field is recommended. + +### Schema inheritance + +When you subclass an existing dataset to add new fields, declare only the +*new* fields in `features`. 
The framework merges parent and child schemas +automatically via `__init_subclass__`, so card generation and Croissant export +always see the full combined schema: + +```python +class DifficultyJSPDataset(JSPLibDataset): + """JSPLib extended with a difficulty score computed from the known optimum.""" + + features = FeaturesInfo({ + "difficulty": FieldInfo( + "float", + "Estimated difficulty: known optimal makespan divided by the number of jobs", + nullable=True, + ), + }) + + def collect_instance_metadata(self, file) -> dict: + meta = super().collect_instance_metadata(file) # get all parent fields + jobs = meta.get("jobs", 1) + bound = meta.get("optimum") or (meta.get("bounds") or {}).get("upper") + if bound and jobs: + meta["difficulty"] = round(bound / jobs, 3) + return meta +``` + +After subclassing, the merged schema includes every parent field plus the new +one — without any extra code: + +```python +info = DifficultyJSPDataset.dataset_metadata() +list(info.features.fields) +# ['jobs', 'machines', 'optimum', 'bounds', 'instance_description', 'difficulty'] +``` + +You can also merge `FeaturesInfo` objects explicitly with `|`, which gives +identical results and is useful when you need to compose schemas from multiple +sources: + +```python +extra = FeaturesInfo({ + "difficulty": FieldInfo("float", "Hardness proxy"), + "cluster_id": FieldInfo("int", "Cluster assignment from k-means study"), +}) +class MyJSP(JSPLibDataset): + features = JSPLibDataset.features | extra +``` + +--- + +## Level 8 — Dataset-level metadata and interoperability + +### DatasetInfo + +Every dataset class exposes a `dataset_metadata()` classmethod that returns a +`DatasetInfo` object. Like `InstanceInfo`, `DatasetInfo` is a dict subclass +with structured properties on top. 
It is available without downloading anything: + +```python +info = JSPLibDataset.dataset_metadata() + +# Structured properties: +info.name # "jsplib" +info.homepage # "https://github.com/tamy0612/JSPLIB" +info.citation # ["J. Adams, E. Balas, D. Zawack. …"] +info.features # FeaturesInfo with the per-instance field schema + +# And as a dict: +info["name"] # "jsplib" (backward-compatible) + +# Inspect the field schema: +for name, fi in info.features.fields.items(): + nullable = " (optional)" if fi.nullable else "" + print(f" {name}: {fi.dtype}{nullable} — {fi.description}") +``` + +This is useful for tooling: you can enumerate all registered datasets, print +their metadata, and compare schemas without loading a single instance. + +### Dataset cards + +`card()` generates a human-readable summary in the HuggingFace Hub convention: +a YAML frontmatter block (for machine parsing) followed by a Markdown body +(for human reading). It includes the citations, the +full `features` schema, the standard CP model feature fields, etc: + +```python +print(JSPLibDataset.card()) +``` + +``` +--- +name: jsplib +homepage: https://github.com/tamy0612/JSPLIB +citation: + - "J. Adams, E. Balas, D. Zawack. The shifting bottleneck procedure for job shop scheduling. Management Science, 1988." +--- + +# jsplib Dataset + +A collection of Job Shop Scheduling benchmark instances. + +**Homepage:** https://github.com/tamy0612/JSPLIB + +## License + +MIT + +## Instance Features (Domain Metadata) + +| Field | Type | Nullable | Description | +|----------|------|----------|--------------------------------| +| `jobs` | int | Yes | Number of jobs | +| … | … | … | … | +… +``` + +Cards are useful for quickly documenting datasets in publications or READMEs. + +### Croissant JSON-LD (MLCommons) + +[Croissant](https://mlcommons.org/working-groups/data/croissant/) is the +MLCommons metadata standard for ML datasets, expressed as JSON-LD. 
CPMpy can +generate a compliant Croissant descriptor for any dataset, including the +per-instance field schema as a `cr:RecordSet`: + +```python +import json + +croissant = JSPLibDataset.dataset_metadata().to_croissant() +print(json.dumps(croissant, indent=2)) +``` + +```json +{ + "@context": {"@vocab": "https://schema.org/", "cr": "http://mlcommons.org/croissant/1.0"}, + "@type": "sc:Dataset", + "name": "jsplib", + "description": "…", + "url": "https://github.com/tamy0612/JSPLIB", + "cr:recordSet": [{ + "@type": "cr:RecordSet", + "name": "instances", + "cr:field": [ + {"@type": "cr:Field", "name": "id", "dataType": "sc:Text"}, + {"@type": "cr:Field", "name": "jobs", "dataType": "sc:Integer", "description": "Number of jobs"}, + … + ] + }] +} +``` + +Croissant descriptors are recognized by Google Dataset Search and other ML +infrastructure tooling. + +### Producing ML-ready records from instances + +Two adapter methods on `InstanceInfo` convert a single instance's metadata to +a flat, standard-format record: + +```python +file_path, info = dataset[0] + +# Croissant example record — id + domain fields + model features, flat dict +record = info.to_croissant() +# {"id": "jsplib/abz5", "jobs": 10, "machines": 10, "optimum": 1234, +# "num_variables": 100, "num_constraints": 47, …} + +# GBD (Global Benchmark Database) feature record +record = info.to_gbd() +# {"id": "jsplib/abz5", "filename": "abz5", "dataset": "jsplib", +# "jobs": 10, "machines": 10, …} +``` + +Both adapters exclude `format_metadata` (format-specific, not portable) and +`model_objects` (not serializable). They include `domain_metadata` and, if +available, `model_features`. 
+
+You can use them as `target_transform` to have all instances automatically
+converted on every iteration — useful when you are feeding the output directly
+into a DataFrame or a database insert:
+
+```python
+from cpmpy.tools.datasets.metadata import to_croissant
+
+dataset = JSPLibDataset(
+    root="./data",
+    target_transform=to_croissant,
+)
+dataset.transform = dataset.load  # populate model_features
+
+import pandas as pd
+records = [record for _, record in dataset]
+df = pd.DataFrame(records)
+# Columns: id, jobs, machines, optimum, num_variables, num_constraints, …
+print(df.describe())
+```
+
+---
+
+## Quick reference
+
+| What you want | How |
+|---------------|-----|
+| Read a field | `info["jobs"]` or `info.get("jobs", default)` |
+| Iterate all fields | `for k, v in info.items()` |
+| Stable instance ID | `info.id` |
+| Problem-level fields only | `info.domain_metadata` |
+| Format-specific fields | `info.format_metadata` |
+| CP model statistics | `info.model_features` — populated after `Load` |
+| Variable name → CPMpy var | `info.model_objects["variables"]` — after `Load` |
+| Add a field in the loop | `enriched = info \| {"my_field": value}` |
+| Add fields on every item automatically | `target_transform=lambda info: info \| {...}` |
+| Strip stale format fields | `info.without_format()` |
+| Strip old + add new format fields | `info.without_format() \| extract_format_metadata(data, "opb")` |
+| Enrich from transform output | implement `enrich_metadata(self, data, metadata)` |
+| Declare field schema | `FeaturesInfo({"field": ("dtype", "description")})` |
+| Extend an existing schema | subclass and declare only new fields in `features` |
+| Merge schemas explicitly | `FeaturesInfo_a \| FeaturesInfo_b` |
+| Dataset-level metadata | `MyDataset.dataset_metadata()` |
+| Dataset card (Markdown) | `MyDataset.card()` |
+| Croissant descriptor (JSON-LD) | `MyDataset.dataset_metadata().to_croissant()` |
+| ML-ready records per instance | 
`target_transform=to_croissant` or `to_gbd` | diff --git a/docs/reading_and_writing.md b/docs/reading_and_writing.md new file mode 100644 index 000000000..a986b1612 --- /dev/null +++ b/docs/reading_and_writing.md @@ -0,0 +1,968 @@ +# Reading, Writing and Datasets + +CPMpy provides a suite of tools for working with different file formats and benchmark sets +from the various communities within Constraint Optimization (CO). They enable simple +programmatic access to these resources and facilitate cross-community access to benchmarks +and systematic comparisons of solvers across paradigms. + +More concretely, we provide a set of readers, loaders (loading problem files into CPMpy model), +datasets, metadata, transformations, writers, etc, all to lower the barrier to entry to experiment +with these benchmarks. + +The dataset class that we provide is PyTorch compatible, allowing for integration within larger +systems within the scientific field. Whilst this tooling builds on top of CPMpy to provide the +transformation capabilities, its programmatic abstraction of CO benchmarks can be used in +combination with any (constraint modelling) system. + +This guide walks you through everything, from a simple one-liner for downloading instance files +to instructions on how to write your own dataset class. 
+
+---
+
+## Supported Formats
+
+| Format | Extension | Load | Write | Domain |
+|--------|-----------|------|-------|--------|
+| **OPB** | `.opb` | ✅ | ✅ | Pseudo-Boolean optimization |
+| **WCNF** | `.wcnf` | ✅ | — | MaxSAT |
+| **DIMACS** | `.cnf` | ✅ | ✅ | SAT |
+| **MPS** | `.mps` | ✅ | ✅ | Mixed integer programming |
+| **LP** | `.lp` | ✅ | ✅ | Linear/integer programming |
+| **FZN** | `.fzn` | ✅ | ✅ | FlatZinc (MiniZinc) |
+| **CIP** | `.cip` | ✅ | ✅ | Constraint integer programming |
+| **GMS** | `.gms` | ✅ | ✅ | GAMS |
+| **PIP** | `.pip` | ✅ | ✅ | Pseudo integer programming |
+| **XCSP3** | `.xml` | ✅ | — | Constraint satisfaction/optimization |
+| **JSPLib** | (none) | ✅ | — | Job Shop Scheduling |
+| **PSPLib** | `.sm` | ✅ | — | Project Scheduling (RCPSP) |
+| **NRP** | `.txt` | ✅ | — | Nurse Rostering |
+
+---
+
+## Loading and Writing Files
+
+### Loading a file
+
+The `load` function auto-detects the format from the file extension:
+
+```python
+from cpmpy.tools.io import load
+
+model = load("instance.opb")  # format detected from extension
+model = load("instance.cnf")
+model = load("problem.mps")
+```
+If the extension does not reveal the intended format, one can also manually provide it:
+
+```python
+model = load("instance.txt", format="opb")
+```
+
+For format-specific control, use the dedicated loaders directly:
+
+```python
+from cpmpy.tools.io.opb import load_opb
+from cpmpy.tools.io.wcnf import load_wcnf
+from cpmpy.tools.io.dimacs import load_dimacs
+
+model = load_opb("instance.opb")
+model = load_wcnf("instance.wcnf")
+model = load_dimacs("instance.cnf")
+
+# Formats backed by SCIP (requires pyscipopt)
+from cpmpy.tools.io.scip import load_scip
+model = load_scip("instance.mps", format="mps")
+model = load_scip("instance.lp", format="lp")
+```
+
+
+
+All loaders also accept raw content strings. 
Useful when the content was already read into memory (or when creating your own content strings through a generator, see next section):
+
+```python
+with open("instance.opb") as f:
+    content = f.read()
+model = load_opb(content)  # raw string works too
+```
+
+### Programmatic construction (problem generators)
+
+Because loaders accept strings, you can *generate* instance content programmatically
+and let CPMpy parse it into a model. This is useful for problem generators,
+random instance sampling, or templating — no need to write files to disk.
+
+**JSPLib** (Job Shop Scheduling): each line after the header is one job; pairs
+are (machine, duration) per task:
+
+```python
+from cpmpy.tools.io.jsplib import load_jsplib
+
+def make_jsplib(n_jobs, n_machines, durations):
+    """Build a JSPLib string from job data. durations: list of (machine, dur) per job."""
+    lines = [f"{n_jobs} {n_machines}"]
+    for job in durations:
+        parts = [f"{m} {d}" for m, d in job]
+        lines.append(" ".join(parts))
+    return "\n".join(lines)
+
+# Example: 2 jobs × 2 tasks on 2 machines
+content = make_jsplib(2, 2, [[(0, 5), (1, 3)], [(1, 2), (0, 4)]])
+# content is:
+# 2 2
+# 0 5 1 3
+# 1 2 0 4
+model = load_jsplib(content)
+model.solve()
+```
+
+**Nurse Rostering** (NRP): the format uses tagged sections (`SECTION_HORIZON`,
+`SECTION_SHIFTS`, `SECTION_STAFF`, `SECTION_COVER`, etc.). Build each section
+as a string (e.g. from templates or loops over staff/shift data) and join them;
+then pass the full string to `load_nurserostering`. See the [NRP format
+guide](https://schedulingbenchmarks.org/nrp/instances1_24.html) for the required
+structure.
+
+**OPB** and other flat formats work the same way — construct the string, pass it
+to the loader, and CPMpy returns a ready-to-solve model. This pattern is
+especially handy for JSPLib, Nurse Rostering, and similar structured text formats
+where you want to vary parameters or generate instances on the fly without
+creating temporary files. 
+ +### Writing a model + +Writing a model from CPMpy back to file is a very similar process to loading: + +```python +import cpmpy as cp +from cpmpy.tools.io import write + +x = cp.intvar(0, 10, name="x") +y = cp.intvar(0, 10, name="y") +model = cp.Model([x + y <= 5], minimize=x + y) + +write(model, "output.opb") # format auto-detected from extension +write(model, "out.txt", format="opb") # explicit format + +# Write to string instead of file (returns the string) +opb_string = write(model, format="opb") +``` + +Again, you can also directly use the format-specific writer functions: + +```python +from cpmpy.tools.io.opb import write_opb +from cpmpy.tools.io.dimacs import write_dimacs +from cpmpy.tools.io.scip import write_scip + +write_opb(model, "output.opb") +write_dimacs(model, "output.cnf") +write_scip(model, "output.mps", format="mps") +write_scip(model, "output.fzn", format="fzn") +``` + +### Handling compressed files + +Many benchmark archives use `.xz` or `.lzma` compression. Pass a custom `open` +argument to any loader: + +```python +import lzma +from cpmpy.tools.io.opb import load_opb + +model = load_opb("instance.opb.xz", open=lzma.open) +``` + +Writers follow the same convention: pass an `open` callable to control how the +output file is opened (for example to write compressed output): + +```python +import lzma +from cpmpy.tools.io.opb import write_opb + +xz_text = lambda path, mode="w": lzma.open(path, "wt") +write_opb(model, "output.opb.xz", open=xz_text) +``` + +--- + +## Datasets + +CPMpy datasets provide a PyTorch-style interface for collections of well-known +CO benchmark instances: download with a single one-liner, iterate over the files +in `(file_path, metadata)` pairs and use built-in transforms for loading and translation. 
+
+
+### Available datasets
+
+| Class | Domain | Format |
+|-------|--------|--------|
+| `XCSP3Dataset` | CP/COP | XCSP3 |
+| `OPBDataset` | Pseudo-Boolean | OPB |
+| `MaxSATEvalDataset` | MaxSAT | WCNF |
+| `JSPLibDataset` | Job Shop Scheduling | JSPLib |
+| `PSPLibDataset` | Project Scheduling | PSPLib |
+| `NurseRosteringDataset` | Nurse Rostering | NRP |
+| `MIPLibDataset` | Mixed Integer Programming | MPS |
+| `SATDataset` | SAT | DIMACS (CNF) |
+
+### Basic iteration
+
+You can simply access the data within a dataset by iterating over its included
+problem instances. If the data is not yet locally available, pass the `download=True`
+optional argument and the dataset will be auto-downloaded from its original source.
+
+```python
+from cpmpy.tools.datasets import JSPLibDataset
+
+dataset = JSPLibDataset(root="./data", download=True)
+print(len(dataset), "instances")
+
+for file_path, info in dataset:
+    ...
+```
+
+Iterating over a dataset always returns 2-tuples. The first element is a problem instance identifier.
+For now, all datasets are file-based, and thus the identifier will always be a filepath to the instance file.
+In the future this could hold other identifiers, like a database query.
+
+
+The second element `info` is an `InstanceInfo` — a dict subclass described in
+detail in the next section. It contains the metadata, both of the instance that it gets paired with and of
+the dataset as a whole. More info on metadata can be found in the
+[Instance Metadata](#instance-metadata-instanceinfo) section below. 
+
+### Loading instances into CPMpy models
+
+Use the dataset's `load` as the `transform` (PyTorch-style):
+
+```python
+from cpmpy.tools.datasets import XCSP3Dataset
+
+dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True)
+dataset.transform = dataset.load
+
+for model, info in dataset:
+    model.solve()
+```
+
+Alternatively, call `load` on demand inside the loop:
+
+```python
+from cpmpy.tools.datasets import XCSP3Dataset
+
+dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True)
+
+for file_path, info in dataset:
+    model = dataset.load(file_path)
+    model.solve()
+```
+
+For more advanced loading, e.g. when you need a custom `open` callable, see [Dataset transform helpers](#dataset-transform-helpers-pytorch-style) for details.
+
+For adding model-level metadata (e.g. `model_features`, `model_objects`) via
+transforms, see [Transform metadata enrichment](#transform-metadata-enrichment-advanced) below.
+
+### Translating to another format
+
+You can translate each instance to another format by looping over the dataset,
+loading the instance into a CPMpy model, and calling a writer (or the unified
+`write` function) to get a string or write to file.
+
+```python
+from cpmpy.tools.datasets import XCSP3Dataset
+from cpmpy.tools.io import write
+
+dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True)
+
+for file_path, info in dataset:
+    model = dataset.load(file_path)
+    opb_string = write(model, format="opb")  # or write(model, "out.opb")
+    print(info.id, len(opb_string), "bytes")
+```
+
+For a one-step transform that does load + serialize in the pipeline (with
+optional custom `open` and metadata enrichment), use the `Translate` helper;
+see [Dataset transform helpers](#dataset-transform-helpers-pytorch-style).
+
+### Saving translated instances to disk
+
+Loop over the dataset, load each instance, and write the model to a file in the
+target format. Use the instance metadata (e.g. 
`info.id`) to build output paths +if you want one file per instance. You can optionally write a `.meta.json` sidecar +yourself, or use the `SaveToFile` helper in the pipeline; see +[Dataset transform helpers](#dataset-transform-helpers-pytorch-style). + +```python +from pathlib import Path +from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.io import write + +dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) +out_dir = Path("./translated") +out_dir.mkdir(parents=True, exist_ok=True) + +for file_path, info in dataset: + model = dataset.load(file_path) + out_path = out_dir / f"{info.id.replace('/', '_')}.opb" + write(model, str(out_path)) # format inferred from extension + print("Saved:", out_path) +``` + +### Dataset transform helpers (PyTorch-style) + +The `cpmpy.tools.datasets.transforms` module provides composable transform classes +that you can assign to `dataset.transform` (or use inside `Compose`): + +| Helper | Purpose | +|--------|---------| +| **`Load`** | Load a file path into a CPMpy model. Accepts a custom `open` callable (e.g. for compressed files) and implements `enrich_metadata` to add `model_features` and `model_objects` to the instance metadata. | +| **`Open`** | Open a file path and return its raw text contents (with optional custom `open` for decompression). No parsing. | +| **`Serialize`** | Turn a CPMpy model into a string in a given format (e.g. `"opb"`, `"dimacs"`, `"mps"` or a writer function). | +| **`Translate`** | Load from one format and serialize to another in one step (e.g. XCSP3 → OPB). Uses a custom `open` for reading and enriches metadata from the intermediate model. | +| **`SaveToFile`** | Write the transform output (e.g. a string) to a file under a given directory; optional `.meta.json` sidecar. | +| **`Compose`** | Chain several transforms; each step's output is passed to the next, and each step's `enrich_metadata` (if present) is called with its own output. 
| +| **`Lambda`** | Wrap a callable as a transform (e.g. `Lambda(lambda path: path.strip())`). | + +Example — load with custom `open` and metadata enrichment: + +```python +from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.datasets.transforms import Load + +dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) +dataset.transform = Load(dataset.load, open=dataset.open) +for model, info in dataset: + # info.model_features, info.model_objects are populated by Load + model.solve() +``` + +Example — translate to another format on the fly: + +```python +from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.datasets.transforms import Translate + +dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) +dataset.transform = Translate(dataset.load, "opb", open=dataset.open) + +for opb_string, info in dataset: + print(len(opb_string), "bytes") +``` + +`Translate` accepts a format name string (`"opb"`, `"dimacs"`, `"mps"`, …) or a +writer function directly. Under the hood it loads the instance into a CPMpy model +and serializes it to the target format. + +Example — translate and save to disk (with optional metadata sidecar): + +```python +from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.datasets.transforms import Compose, Translate, SaveToFile + +dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) + +dataset.transform = Compose([ + Translate(dataset.load, "opb", open=dataset.open), + SaveToFile("./translated/", extension=".opb", write_metadata=True), +]) + +for output_path, info in dataset: + print("Saved:", output_path) +``` + +`SaveToFile` with `write_metadata=True` writes a `.meta.json` sidecar alongside +each file, capturing the portable instance metadata. 
+ +Example — load to model, then serialize to string (Compose): + +```python +from cpmpy.tools.datasets.transforms import Compose, Load, Serialize + +dataset.transform = Compose([ + Load(dataset.load, open=dataset.open), + Serialize("opb"), +]) + +for opb_string, info in dataset: + ... +``` + +For more examples and custom transforms, see the [Transforms guide](transforms_guide.md). + +### Transform metadata enrichment (advanced) + +Transforms can be *classes* that implement an `enrich_metadata(self, data, metadata)` method. After each item is produced, the dataset calls this method so the transform can add or update metadata based on its output (e.g. the loaded model). That is how fields like `model_features` (variable/constraint counts, objective info) and `model_objects` (e.g. `variables`) appear in `info` when using `Load` — `Load` implements `enrich_metadata` and fills those in from the CPMpy model. + +Any custom transform class can do the same: implement `__call__` for the transformation and `enrich_metadata` to update metadata from the result. The dataset calls `enrich_metadata` automatically after `__call__`. For full details and examples, see [Updating metadata from a transform](#updating-metadata-from-a-transform-enrich_metadata) in the Enriching Metadata section. + +--- + +## Instance Metadata (`InstanceInfo`) + +Every dataset iteration yields an `InstanceInfo` as the second element. +`InstanceInfo` is a plain dict subclass — dict access works +unchanged — with additional structured properties. 
+ +### Dict access + +```python +file, info = dataset[0] + +info["name"] # "abz5" +info.get("jobs", 0) # 10 +"optimum" in info # True +``` + +### Structured properties + +```python +info.id # "jsplib/abz5" — stable slash-separated identifier +info.domain_metadata # {"jobs": 10, "machines": 10, "optimum": 1234, …} +info.format_metadata # {"opb_num_variables": …} — only if in an OPB format +info.model_features # {"num_variables": …, "objective": …} — only after Load +info.model_objects # {"variables": {name: var}} — only after Load +``` + +The four metadata partitions: + +| Property | What it contains | Serializable | +|----------|-----------------|:---:| +| `domain_metadata` | Problem-level, format-independent fields (`jobs`, `machines`, `horizon`, …) | ✅ | +| `format_metadata` | Format-specific fields (`opb_*`, `wcnf_*`, `mps_*`, `xcsp_*`, `dimacs_*`) | ✅ | +| `model_features` | CP model statistics: variable counts, constraint counts, objective info | ✅ | +| `model_objects` | Live CPMpy objects: `variables` map — **only in-memory when the transform returns a CPMpy model (e.g. `dataset.load`, `Load`, `Translate`)** | ❌ | + +### Reading solution values from metadata + +Any dataset loader that returns a CPMpy model (including using the dataset's `load` +as the transform) populates `info.model_objects["variables"]` with a +`{name: CPMpy_variable}` mapping. 
After solving, you can read values directly +from that map without needing a separate reference to the variables: + +```python +from cpmpy.tools.datasets import JSPLibDataset + +dataset = JSPLibDataset(root="./data") +dataset.transform = dataset.load + +for model, info in dataset: + if model.solve(): + vars = info.model_objects["variables"] + print(f"{info['name']}: objective = {model.objective_value()}") + for name, var in vars.items(): + print(f" {name} = {var.value()}") +``` + +`model_objects` is intentionally excluded from [to_croissant()](#converting-to-standard-formats), +[to_gbd()](#converting-to-standard-formats), and [.meta.json sidecars](instance_metadata.md#why-model-objects-live-in-metadata) — the live variable objects exist +only for the duration of one iteration and cannot be serialised. + +### Converting to standard formats + +Instance metadata can be exported into standardized, interchange formats so that +benchmark records can be consumed by other tools, ML pipelines, or databases +without relying on CPMpy-specific types. Each format produces a plain Python +dict (JSON-serialisable) with a stable set of fields. Additional formats may +be added in future releases. 
+ +| Format | Standard / use case | Method / adapter | +|------------|----------------------------------------------|-------------------------| +| **Croissant** | MLCommons Croissant 1.0 (dataset metadata) | `info.to_croissant()` | +| **GBD** | Global Benchmark Database-style features | `info.to_gbd()` | + +```python +info.to_croissant() # flat dict record for Croissant-style export +info.to_gbd() # flat dict record for GBD-style export +``` + +These adapters can also be passed directly as `target_transform`: + +```python +from cpmpy.tools.datasets.metadata import to_croissant + +dataset = JSPLibDataset(root="./data", target_transform=to_croissant) +for file_path, record in dataset: + print(record["id"], record["jobs"]) # plain dict, Croissant-compatible +``` + +--- + +## Enriching Metadata + +### Adding fields (most common) + +Return a plain dict delta from `target_transform` or use `|` inside the loop. +Everything else in `info` is preserved automatically. + +```python +# Via target_transform — applied automatically on every item +dataset = JSPLibDataset( + root="./data", + target_transform=lambda info: info | { + "density": info["jobs"] / info["machines"], + "has_optimum": info.get("optimum") is not None, + }, +) + +for file_path, info in dataset: + print(info["density"], info["has_optimum"]) +``` + +```python +# Or directly in the loop +for file_path, info in dataset: + enriched = info | {"difficulty": compute_difficulty(file_path)} +``` + +The `|` operator always returns a new `InstanceInfo`, so structured properties +remain available on the result. + +### Changing format + +When a transform produces a different file format, the old format-specific fields +should be dropped and new ones added. 
`without_format()` handles the drop; +chain it with `|` to add the new fields: + +```python +from cpmpy.tools.datasets.transforms import extract_format_metadata + +for opb_string, info in dataset: + new_info = info.without_format() | extract_format_metadata(opb_string, "opb") + # ↑ domain_metadata carried forward ↑ new opb_* fields added + print(new_info["jobs"]) # still there + print(new_info["opb_num_variables"]) # new +``` + +`without_format()` with no arguments strips format fields and carries everything +else forward. Chaining with `|` is optional — omit it if you just want to strip: + +```python +stripped = info.without_format() +assert not stripped.format_metadata +``` + +### Updating metadata from a transform (`enrich_metadata`) + +When you write a custom transform class, implement `enrich_metadata(self, data, +metadata)` to update metadata based on the transform's output. It is called +automatically by the dataset after `__call__` returns. + +```python +from cpmpy.tools.datasets.transforms import Translate, extract_format_metadata + +class TranslateToOPB: + """Translate a JSPLib instance to OPB format, updating metadata.""" + + def __init__(self, loader, open): + self._translate = Translate(loader, "opb", open=open) + + def __call__(self, file_path): + self._last_output = self._translate(file_path) + return self._last_output + + def enrich_metadata(self, data, metadata): + # data = OPB string from __call__ + # metadata = current InstanceInfo + return metadata.without_format() | extract_format_metadata(data, "opb") + + +dataset = JSPLibDataset(root="./data") +dataset.transform = TranslateToOPB(dataset.load, open=dataset.open) + +for opb_string, info in dataset: + print(info["jobs"]) # domain field: carried forward + print(info["opb_num_variables"]) # populated from new format +``` + +--- + +## Dataset-Level Metadata + +Every dataset class carries a `DatasetInfo` object with name, homepage, citation, etc, and a schema of the instance-level fields. 
+ +The instance field schema lives in `info.features`: it is a `FeaturesInfo` object whose `fields` attribute is a dict mapping each field name to a `FieldInfo` (with `dtype`, `description`, and optionally `nullable` and `example`). Iterating over it lets you inspect what metadata fields the dataset declares and their types and descriptions: + +```python +info = JSPLibDataset.dataset_metadata() # no instance needed + +info.name # "jsplib" +info.homepage # "https://github.com/tamy0612/JSPLIB" +info.citation # ["J. Adams et al. …"] + +# Instance field schema: field_name → FieldInfo (dtype, description, nullable, example) +for field_name, fi in info.features.fields.items(): + print(field_name, fi.dtype, fi.description) +# Example output: +# jobs int Number of jobs +# machines int Number of machines +# optimum int Known optimal makespan, if available +# bounds dict Upper/lower bounds on the optimal makespan +``` + +For defining this schema when **creating your own dataset**, and for the full list of schema fields and shorthand forms, see [Instance Metadata — Declaring a metadata schema](instance_metadata.md#level-7--declaring-a-metadata-schema) and [Dataset authoring — Enriched dataset](dataset_authoring.md#enriched-dataset-optional-dataset-metadata-and-a-field-schema). There you will also find that **all schema fields are optional**: you can omit `features` entirely or use minimal declarations per field; the docs explain the defaults and what you lose by not defining fields fully. + +`DatasetInfo` is also a dict subclass, so `info["name"]` works alongside `info.name`. + +### Dataset card (HuggingFace convention) + +*Dataset cards* are standard README-style documents for a dataset: a short description, homepage, citations, and a table of instance metadata fields. They follow the [HuggingFace Hub dataset card](https://huggingface.co/docs/hub/datasets-cards) convention so that both humans and tooling can understand what the dataset contains without loading any instances. 
+ +Use them as the README for a published dataset, as appendix material in papers, or to compare datasets. Generation requires no download — it uses only the class-level `DatasetInfo`. + +`card()` returns a single string: a **YAML frontmatter** block (for machine parsing) followed by a **Markdown** body (description, homepage, citations, instance features table, and a short usage example). + +```python +card = JSPLibDataset.card() # classmethod — no download needed +print(card) +``` + +Example output (abbreviated): + +``` +--- +name: jsplib +homepage: https://github.com/tamy0612/JSPLIB +citation: + - "J. Adams, E. Balas, D. Zawack. The shifting bottleneck procedure for job shop scheduling. Management Science, 1988." +--- + +# jsplib Dataset + +A collection of Job Shop Scheduling benchmark instances. + +**Homepage:** https://github.com/tamy0612/JSPLIB + +## License +MIT + +## Instance Features (Domain Metadata) +| Field | Type | Nullable | Description | +|----------|------|----------|--------------------------------| +| `jobs` | int | Yes | Number of jobs | +| `machines` | int | Yes | Number of machines | +... +``` + +### Croissant JSON-LD (MLCommons) + +[Croissant](https://mlcommons.org/working-groups/data/croissant/) is the MLCommons metadata standard for machine-learning datasets. A Croissant descriptor is a **JSON-LD** document that describes a dataset (name, description, homepage) and the schema of each instance (field names, types, descriptions). It is machine-readable and uses standard vocabularies (schema.org, Croissant `cr:` terms) so that crawlers, search engines, and ML tooling can discover and interpret the dataset without loading it. + +Use Croissant when you want to **publish a dataset** in a way that Google Dataset Search and other ML infrastructure can index, or when you need a **portable schema** (e.g. for validation or codegen). Like dataset cards, generation uses only `DatasetInfo` — no download of the actual dataset needed. 
+ +`to_croissant()` returns a dict that you can serialize to JSON and publish next to your data (e.g. as `metadata.json`): + +```python +import json +croissant = JSPLibDataset.dataset_metadata().to_croissant() +print(json.dumps(croissant, indent=2)) +# Or save to file: json.dump(croissant, open("metadata.json", "w"), indent=2) +``` + +Example output (abbreviated): + +```json +{ + "@context": {"@vocab": "https://schema.org/", "cr": "http://mlcommons.org/croissant/1.0"}, + "@type": "sc:Dataset", + "name": "jsplib", + "description": "A collection of Job Shop Scheduling benchmark instances.", + "url": "https://github.com/tamy0612/JSPLIB", + "license": "MIT", + "cr:recordSet": [{ + "@type": "cr:RecordSet", + "name": "instances", + "cr:field": [ + {"@type": "cr:Field", "name": "id", "dataType": "sc:Text"}, + {"@type": "cr:Field", "name": "jobs", "dataType": "sc:Integer", "description": "Number of jobs"}, + {"@type": "cr:Field", "name": "machines", "dataType": "sc:Integer", "description": "Number of machines"}, + {"@type": "cr:Field", "name": "optimum", "dataType": "sc:Integer", "description": "Known optimal makespan, if available"}, + … + ] + }] +} +``` + +The `cr:recordSet` describes the shape of each instance (e.g. one row per file); `cr:field` lists the instance-level metadata fields and their schema.org types. The descriptor also includes standard CP model feature fields (e.g. `num_variables`, `num_constraints`) so that downstream tools know what to expect after loading. + +--- + +## Creating a Custom Dataset + +### Minimal dataset + +Subclass `FileDataset` and implement four things: + +```python +import cpmpy as cp +from cpmpy.tools.datasets import FileDataset + + +class MyDataset(FileDataset): + + # Required class attributes + name = "mydataset" + description = "A short description of the dataset." 
+ homepage = "https://example.com/mydataset" + + def __init__(self, root=".", transform=None, target_transform=None, + download=False, metadata_workers=1): + import pathlib + super().__init__( + dataset_dir=pathlib.Path(root) / self.name, + transform=transform, target_transform=target_transform, + download=download, extension=".txt", + metadata_workers=metadata_workers, + ) + + @staticmethod + def _loader(content: str) -> cp.Model: + """Parse raw file content and return a CPMpy model.""" + # ... your parsing logic here ... + return cp.Model() + + def category(self) -> dict: + """Return category labels (e.g. year/track). Empty dict if none.""" + return {} + + def download(self): + """Download instances to self.dataset_dir.""" + raise NotImplementedError +``` + +### Adding rich metadata + +Declare optional class attributes for a fully documented dataset: + +```python +from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo + + +class MyDataset(FileDataset): + + name = "mydataset" + description = "A short description of the dataset." + homepage = "https://example.com/mydataset" + citation = ["Author et al. Title. Journal, 2024."] + + # Declares the per-instance metadata fields this dataset provides + features = FeaturesInfo({ + "num_jobs": ("int", "Number of jobs"), + "num_machines": ("int", "Number of machines"), + "optimum": FieldInfo("int", "Known optimal value", nullable=True), + }) + + def collect_instance_metadata(self, file) -> dict: + """Extract metadata from a single instance file.""" + # Return a dict whose keys match the fields declared in `features` + return { + "num_jobs": ..., + "num_machines": ..., + } + + # ... rest of the class as before ... +``` + +`card()` and `to_croissant()` use these attributes automatically — no extra work +needed. + +### Subclassing an existing dataset + +If you want to extend a dataset with additional fields, subclass it and declare +only the new fields. 
The framework merges parent and child schemas automatically:
+
+```python
+class DifficultyJSPDataset(JSPLibDataset):
+    """JSPLib extended with a computed difficulty score."""
+
+    # Only the NEW field — {jobs, machines, optimum, …} are merged in automatically
+    features = FeaturesInfo({
+        "difficulty": FieldInfo("float", "Makespan / num_jobs ratio", nullable=True),
+    })
+
+    def collect_instance_metadata(self, file) -> dict:
+        meta = super().collect_instance_metadata(file)  # get parent fields
+        jobs = meta.get("jobs")  # no default: a missing field must not fake a score
+        makespan = meta.get("optimum") or (meta.get("bounds") or {}).get("upper")
+        if makespan and jobs:
+            meta["difficulty"] = round(makespan / jobs, 3)
+        return meta
+```
+
+The merged schema appears in `card()`, `to_croissant()`, and `validate()` without
+any extra code:
+
+```python
+info = DifficultyJSPDataset.dataset_metadata()
+print(list(info.features.fields))
+# ['jobs', 'machines', 'optimum', 'bounds', 'instance_description', 'difficulty']
+```
+
+You can also merge `FeaturesInfo` schemas directly using `|`:
+
+```python
+# Explicit merge — same result as auto-merge, more verbose
+class MyJSP(JSPLibDataset):
+    features = JSPLibDataset.features | FeaturesInfo({"difficulty": "float"})
+```
+
+---
+
+## Writing a Custom Transform
+
+Transforms can be **any callable**: a function or a lambda is enough when you only need to change the data. When you need to **update metadata from the transformed result** (e.g. add file size, or new format fields after translation), use a class that implements `enrich_metadata`. This section starts with simple callables, then describes their limitations, then introduces the class-based form.
+
+### Simple transforms: functions and lambdas
+
+The dataset calls your transform with the current item (file path, or the output of the previous transform in a pipeline) and uses the return value as the new item. A plain function or lambda is sufficient when you don't need to change metadata based on that result.
+ +```python +# Pass-through (no change) +dataset.transform = lambda x: x + +# Upper-case the path (silly but valid) +dataset.transform = lambda path: path.upper() if isinstance(path, str) else path + +# Load and return the raw file content +def load_raw(path): + with open(path) as f: + return f.read() +dataset.transform = load_raw + +for content, info in dataset: + print(len(content), info["name"]) +``` + +These work with `Compose` as well: any callable in the list is invoked in order, and the output of one becomes the input of the next. + +### Limitations of callable-only transforms + +A plain function or lambda **cannot** update metadata from the transformed data. The dataset only calls `enrich_metadata(data, metadata)` when the transform object has that method. So you cannot: + +- Add fields derived from the transform output (e.g. file size from the path, or `opb_num_variables` from the translated string). +- Strip old format metadata and attach new format fields when the transform changes format (e.g. WCNF → OPB). + +For metadata-only updates that don't depend on the transformed data, use **`target_transform`** instead (it receives the current `InstanceInfo` and returns an updated one). For updates that *do* depend on the transform output — or when you want to hold state (e.g. a loader, an `open` callable) in a clear way — use a **class-based transform** with `__call__` and optionally `enrich_metadata`. + +### Class-based transforms + +All transforms follow the same protocol: a callable `__call__(self, data)` that transforms the data, and an optional `enrich_metadata(self, data, metadata)` method that updates the instance metadata based on the transformed data. + +```python +class MyTransform: + + def __call__(self, file_path: str) -> Any: + """ + Transform the data. Receives the file path (or the output of the + previous transform in a Compose chain) and returns anything. + """ + ... 
+
+
+    def enrich_metadata(self, data, metadata: InstanceInfo) -> InstanceInfo:
+        """
+        Update metadata based on the output of __call__.
+
+        - data : the value returned by __call__
+        - metadata : the current InstanceInfo for this instance
+        - returns : updated InstanceInfo
+
+        Called automatically by the dataset after __call__ returns.
+        Omit this method if your transform does not affect metadata.
+        """
+        return metadata | {"my_field": compute(data)}
+```
+
+### Example: annotating instances with file size
+
+```python
+class AnnotateFileSize:
+
+    def __call__(self, file_path):
+        return file_path # pass through unchanged
+
+    def enrich_metadata(self, data, metadata):
+        import os
+        return metadata | {"file_size_bytes": os.path.getsize(data)}
+
+
+dataset = JSPLibDataset(root="./data")
+dataset.transform = AnnotateFileSize()
+
+for file_path, info in dataset:
+    print(info["file_size_bytes"])
+```
+
+### Example: format-changing transform
+
+When `__call__` produces output in a different format, use `without_format()` in
+`enrich_metadata` to drop the old format fields and add the new ones:
+
+```python
+from cpmpy.tools.datasets.transforms import Translate, extract_format_metadata
+
+class TranslateToDIMACS:
+
+    def __init__(self, loader, open):
+        self._translate = Translate(loader, "dimacs", open=open)
+
+    def __call__(self, file_path):
+        # the return value is exactly what enrich_metadata later receives as `data`
+        return self._translate(file_path)
+
+    def enrich_metadata(self, data, metadata):
+        dimacs_fields = extract_format_metadata(data, "dimacs")
+        return metadata.without_format() | dimacs_fields
+```
+
+### Composing transforms
+
+Chain multiple transforms with `Compose`.
Each step's `enrich_metadata` is called +with the output that step produced, so each transform sees its own output: + +```python +from cpmpy.tools.datasets.transforms import Compose, Load, Serialize + +dataset.transform = Compose([ + Load(dataset.load, open=dataset.open), # file_path → CPMpy model + Serialize("opb"), # CPMpy model → OPB string +]) + +# Load.enrich_metadata receives the model and adds model_features +# Serialize has no enrich_metadata — no metadata changes at that step +``` + +--- + +## Examples + +Runnable examples are in `examples/datasets/`: + +| File | Covers | +|------|--------| +| `01_basic_usage.py` | Iterating, dict access, `InstanceInfo` properties | +| `02_dataset_card_and_croissant.py` | `DatasetInfo`, `card()`, Croissant export | +| `03_target_transforms.py` | `target_transform`, `to_croissant`, `to_gbd` | +| `04_custom_dataset.py` | Minimal, enriched, and subclassed dataset classes | +| `05_features_merge.py` | `FeaturesInfo \|`, auto-merge, multi-level inheritance | +| `06_benchmark_survey.py` | Iterating all datasets, collecting metadata statistics | +| `07_metadata_enrichment.py` | `\|`, `without_format()`, `enrich_metadata` | + +--- + +## Further Reading + +- [Datasets](datasets.md) — dataset quickstart and pipelines +- [Instance Metadata](instance_metadata.md) — full guide to `InstanceInfo`, enrichment, and interoperability +- [Transforms guide](transforms_guide.md) — authoring transforms and analytics pipelines +- [Dataset authoring](dataset_authoring.md) — implementing datasets, loaders, metadata schemas +- [Benchmarking workflows](benchmarking_workflows.md) — dataset-driven experiment patterns +- [Datasets API](api/tools/datasets.rst) +- [Benchmark runner](api/tools/benchmark_runner.rst) From b542c6e00cfbee4de69bb7d7542eb794f8c99155 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 11 Mar 2026 09:53:13 +0100 Subject: [PATCH 145/152] Wcnf reader and writer --- cpmpy/tools/io/dimacs.py | 212 
+++++++++++++++++++++++++++++++-------- cpmpy/tools/io/wcnf.py | 7 +- 2 files changed, 177 insertions(+), 42 deletions(-) diff --git a/cpmpy/tools/io/dimacs.py b/cpmpy/tools/io/dimacs.py index 3f16776f3..604ef665d 100644 --- a/cpmpy/tools/io/dimacs.py +++ b/cpmpy/tools/io/dimacs.py @@ -15,18 +15,69 @@ import cpmpy as cp -from cpmpy.expressions.variables import _BoolVarImpl, NegBoolView -from cpmpy.expressions.core import Operator, Comparison +from cpmpy.expressions.variables import _BoolVarImpl, NegBoolView, _IntVarImpl +from cpmpy.expressions.core import Operator from cpmpy.transformations.normalize import toplevel_list from cpmpy.transformations.to_cnf import to_cnf from cpmpy.transformations.get_variables import get_variables +from cpmpy.transformations.safening import safen_objective +from cpmpy.transformations.flatten_model import flatten_objective +from cpmpy.transformations.linearize import decompose_linear_objective, only_positive_coefficients_ +from cpmpy.transformations.int2bool import _encode_lin_expr -import re from typing import Optional, Callable import builtins -def write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMACS file written by CPMpy", open: Optional[Callable]=None): + +def _transform_objective(expr, encoding="auto"): + """ + Transform objective into weighted Boolean literals plus helper constraints. 
+ + Returns: + (weights, xs, const, extra_cons) + """ + csemap, ivarmap = dict(), dict() + obj, safe_cons = safen_objective(expr) + obj, decomp_cons = decompose_linear_objective( + obj, + supported=frozenset(), + supported_reified=frozenset(), + csemap=csemap, + ) + obj, flat_cons = flatten_objective(obj, csemap=csemap) + + weights, xs, const = [], [], 0 + # we assume obj is a var, a sum or a wsum (over int and bool vars) + if isinstance(obj, _IntVarImpl) or isinstance(obj, NegBoolView): # includes _BoolVarImpl + weights = [1] + xs = [obj] + elif obj.name == "sum": + xs = obj.args + weights = [1] * len(xs) + elif obj.name == "wsum": + weights, xs = obj.args + else: + raise NotImplementedError(f"DIMACS: Non supported objective {obj} (yet?)") + + terms, enc_cons, k = _encode_lin_expr(ivarmap, xs, weights, encoding, csemap=csemap) + const += k + + extra_cons = safe_cons + decomp_cons + flat_cons + enc_cons + + # remove terms with coefficient 0 (`only_positive_coefficients_` may return them and RC2 does not accept them) + terms = [(w, x) for w, x in terms if w != 0] + if len(terms) == 0: + return [], [], const, extra_cons + + ws, xs = zip(*terms) # unzip + new_weights, new_xs, k = only_positive_coefficients_(ws, xs) + const += k + + return list(new_weights), list(new_xs), const, extra_cons + + +def write_dimacs(model, fname=None, encoding="auto", p_header:bool=False, header:Optional[str]="DIMACS file written by CPMpy", open: Optional[Callable]=None): """ Writes CPMpy model to DIMACS format Uses the "to_cnf" transformation from CPMpy @@ -37,6 +88,7 @@ def write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMAC :param model: a CPMpy model :param fname: optional, file name to write the DIMACS output to :param encoding: the encoding used for `int2bool`, choose from ("auto", "direct", "order", or "binary") + :param p_header: whether to include the ``p ...`` problem header line (default: ``False``) :param open: optional callable to open the file for 
writing (default: builtin ``open``). Called as ``open(fname, "w")``. This mirrors the ``open=`` argument in loaders and allows custom compression or I/O (e.g. @@ -44,15 +96,23 @@ def write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMAC """ if model.has_objective(): - raise ValueError("DIMACS format does not support objective functions") + hard_prefix = "h " + else: + hard_prefix = "" constraints = toplevel_list(model.constraints) + objective_lits = [] + objective_weights = [] + if model.has_objective(): + objective_weights, objective_lits, _, extra_cons = _transform_objective(model.objective_, encoding=encoding) + constraints += extra_cons constraints = to_cnf(constraints, encoding=encoding) - vars = get_variables(constraints) + vars = get_variables(constraints + objective_lits) mapping = {v : i+1 for i, v in enumerate(vars)} + out = "" - out = f"p cnf {len(vars)} {len(constraints)}\n" + for cons in constraints: if isinstance(cons, _BoolVarImpl): @@ -71,7 +131,25 @@ def write_dimacs(model, fname=None, encoding="auto", header:Optional[str]="DIMAC else: raise ValueError(f"Expected Boolean variable in clause, but got {v} which is of type {type(v)}") - out += " ".join(ints + ["0"]) + "\n" + out += hard_prefix + " ".join(ints + ["0"]) + "\n" + + if model.has_objective(): + max_weight = max(objective_weights) + for w, x in zip(objective_weights, objective_lits): + if isinstance(x, NegBoolView): + lit = -mapping[x._bv] + elif isinstance(x, _BoolVarImpl): + lit = mapping[x] + else: + raise ValueError(f"Expected Boolean literal in objective, but got {x} of type {type(x)}") + transformed_weight = max_weight - w if model.objective_is_min else w + out += f"{transformed_weight} {lit} 0\n" + + if p_header: + out = f"p wcnf {len(vars)} {len(constraints)} {max(objective_weights)}\n" + out + else: + if p_header: + out = f"p cnf {len(vars)} {len(constraints)}\n" + out if header is not None: header_lines = ["c " + line for line in header.splitlines()] @@ 
-100,41 +178,95 @@ def load_dimacs(fname, open=None): if open is None: open = builtins.open + with open(fname, "r") as f: + lines = f.readlines() + + # Auto-detect weighted instances: + # - explicit `p wcnf ...` header + # - any hard-clause line starting with `h` + # - no header but all non-comment clause lines look weighted (weight literals... 0) + is_weighted = False + weighted_compatible = True + saw_clause_line = False + for raw in lines: + line = raw.strip() + if line == "" or line.startswith("c"): + continue + if line.startswith("p"): + params = line.split() + assert len(params) >= 4, f"Expected p-header to be formed `p ...` but got {line}" + _, typ, *_ = params + if typ == "wcnf": + is_weighted = True + elif typ != "cnf": + raise ValueError(f"Expected `cnf` or `wcnf` as file format, but got {typ} which is not supported.") + break + if line.startswith("h"): + is_weighted = True + break + saw_clause_line = True + try: + ints = [int(tok) for tok in line.split()] + except ValueError: + weighted_compatible = False + continue + if len(ints) < 2 or ints[-1] != 0 or ints[0] < 0: + weighted_compatible = False + + if not is_weighted and saw_clause_line and weighted_compatible: + is_weighted = True + + if is_weighted: + from cpmpy.tools.io.wcnf import load_wcnf + return load_wcnf(fname, open=open) + + # CNF parse (strict with p-line counts when present, inferred otherwise) m = cp.Model() + clause = [] + clauses = [] + nr_vars_declared = None + nr_cls_declared = None + max_var = 0 - with open(fname, "r") as f: - clause = [] - nr_vars = None - for line in f.readlines(): - if line == "" or line.startswith("c"): - continue # skip empty and comment lines - elif line.startswith("p"): - params = line.strip().split(" ") - assert len(params) == 4, f"Expected p-header to be formed `p cnf nr_vars nr_cls` but got {line}" - _,typ,nr_vars,nr_cls = params - if typ != "cnf": - raise ValueError("Expected `cnf` (i.e. 
DIMACS) as file format, but got {typ} which is not supported.") - nr_vars = int(nr_vars) - if nr_vars>0: - bvs = cp.boolvar(shape=nr_vars) - nr_cls = int(nr_cls) + for raw in lines: + line = raw.strip() + if line == "" or line.startswith("c"): + continue # skip empty and comment lines + if line.startswith("p"): + params = line.split() + assert len(params) == 4, f"Expected p-header to be formed `p cnf nr_vars nr_cls` but got {line}" + _, typ, nr_vars, nr_cls = params + if typ != "cnf": + raise ValueError(f"Expected `cnf` (i.e. DIMACS) as file format, but got {typ} which is not supported.") + nr_vars_declared = int(nr_vars) + nr_cls_declared = int(nr_cls) + continue + + for token in line.split(): + i = int(token) + if i == 0: + clauses.append(clause) + clause = [] else: - assert nr_vars is not None, "Expected p-line before first clause" - for token in line.strip().split(): - i = int(token.strip()) - if i == 0: - m += cp.any(clause) - clause = [] - else: - var=abs(i)-1 - assert var < nr_vars, "Expected at most {nr_vars} variables (from p-line) but found literal {i} in clause {line}" - bv = bvs[var] - - clause.append(bv if i > 0 else ~bv) - - assert nr_vars is not None, "Expected file to contain p-line, but did not" - assert len(clause) == 0, f"Expected last clause to be terminated by 0, but it was not" - assert len(m.constraints) == nr_cls, f"Number of clauses was declared in p-line as {nr_cls}, but was {len(m.constraints)}" + max_var = max(max_var, abs(i)) + clause.append(i) + + assert len(clause) == 0, "Expected last clause to be terminated by 0, but it was not" + + nr_vars = nr_vars_declared if nr_vars_declared is not None else max_var + if nr_vars_declared is not None: + assert max_var <= nr_vars_declared, f"Expected at most {nr_vars_declared} variables (from p-line) but found literal index {max_var}" + + bvs = cp.boolvar(shape=nr_vars) if nr_vars > 0 else [] + for cl in clauses: + lits = [] + for i in cl: + bv = bvs[abs(i)-1] + lits.append(bv if i > 0 else ~bv) + 
m += cp.any(lits) + + if nr_cls_declared is not None: + assert len(m.constraints) == nr_cls_declared, f"Number of clauses was declared in p-line as {nr_cls_declared}, but was {len(m.constraints)}" return m diff --git a/cpmpy/tools/io/wcnf.py b/cpmpy/tools/io/wcnf.py index 46f140388..ae536bb7e 100644 --- a/cpmpy/tools/io/wcnf.py +++ b/cpmpy/tools/io/wcnf.py @@ -56,7 +56,7 @@ def load_wcnf(wcnf: Union[str, os.PathLike], open=open) -> cp.Model: # If wcnf is a path to a file -> open file if isinstance(wcnf, (str, os.PathLike)) and os.path.exists(wcnf): if open is not None: - f = open(wcnf) + f = open(wcnf, "rt") else: f = _std_open(wcnf, "rt") # If wcnf is a string containing a model -> create a memory-mapped file @@ -66,7 +66,6 @@ def load_wcnf(wcnf: Union[str, os.PathLike], open=open) -> cp.Model: model = cp.Model() vars = {} soft_terms = [] - for raw in f: line = raw.strip() @@ -74,6 +73,10 @@ def load_wcnf(wcnf: Union[str, os.PathLike], open=open) -> cp.Model: if not line or line.startswith("c"): continue + # Problem line: ignore header + if line.startswith("p"): + continue + # Hard clause if line[0] == "h": literals = map(int, line[1:].split()) From 2aa5ed752238ddbad112fb9b17b9565778c36641 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 11 Mar 2026 11:20:16 +0100 Subject: [PATCH 146/152] Dimacs loader support raw strings --- cpmpy/tools/io/dimacs.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/cpmpy/tools/io/dimacs.py b/cpmpy/tools/io/dimacs.py index 604ef665d..ddd06276a 100644 --- a/cpmpy/tools/io/dimacs.py +++ b/cpmpy/tools/io/dimacs.py @@ -13,6 +13,8 @@ An integer represents a Boolean variable and a negative Boolean variable is represented using a `'-'` sign. 
""" +import os + import cpmpy as cp from cpmpy.expressions.variables import _BoolVarImpl, NegBoolView, _IntVarImpl @@ -26,7 +28,7 @@ from cpmpy.transformations.linearize import decompose_linear_objective, only_positive_coefficients_ from cpmpy.transformations.int2bool import _encode_lin_expr -from typing import Optional, Callable +from typing import Optional, Callable, Union import builtins @@ -163,7 +165,7 @@ def write_dimacs(model, fname=None, encoding="auto", p_header:bool=False, header return out -def load_dimacs(fname, open=None): +def load_dimacs(dimacs: Union[str, os.PathLike], open=None): """ Load a CPMpy model from a DIMACS formatted file strictly following the specification: https://web.archive.org/web/20190325181937/https://www.satcompetition.org/2009/format-benchmarks2009.html @@ -171,15 +173,20 @@ def load_dimacs(fname, open=None): .. note:: The p-line has to denote the correct number of variables and clauses - :param fname: the name of the DIMACS file + :param dimacs: + - A file path to a DIMACS/WCNF file + - OR a string containing DIMACS/WCNF content directly :param open: optional callable to open the file for reading (default: builtin ``open``). Use for decompression, e.g. ``lambda p: lzma.open(p, 'rt')`` for ``.cnf.xz``. 
""" if open is None: open = builtins.open - with open(fname, "r") as f: - lines = f.readlines() + if isinstance(dimacs, (str, os.PathLike)) and os.path.exists(dimacs): + with open(dimacs, "r") as f: + lines = f.readlines() + else: + lines = str(dimacs).splitlines() # Auto-detect weighted instances: # - explicit `p wcnf ...` header @@ -218,7 +225,7 @@ def load_dimacs(fname, open=None): if is_weighted: from cpmpy.tools.io.wcnf import load_wcnf - return load_wcnf(fname, open=open) + return load_wcnf(dimacs, open=open) # CNF parse (strict with p-line counts when present, inferred otherwise) m = cp.Model() From b3478bfd5b8e68065820b7eed3b42c54eece0c30 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 11 Mar 2026 11:20:46 +0100 Subject: [PATCH 147/152] Consistent IO naming --- cpmpy/tools/io/nurserostering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpmpy/tools/io/nurserostering.py b/cpmpy/tools/io/nurserostering.py index 111625a41..aac43a651 100644 --- a/cpmpy/tools/io/nurserostering.py +++ b/cpmpy/tools/io/nurserostering.py @@ -27,7 +27,7 @@ from cpmpy.tools.datasets.nurserostering import ( parse_scheduling_period, - nurserostering_model + model_nurserostering ) @@ -61,7 +61,7 @@ def load_nurserostering(instance: Union[str, os.PathLike], open=open) -> cp.Mode data = parse_scheduling_period(fname) # Create the CPMpy model using the existing model builder - model, _ = nurserostering_model(**data) + model, _ = model_nurserostering(**data) return model finally: From 974df6b6039422cceddec6e90ea6370b6c6386f0 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 11 Mar 2026 11:24:43 +0100 Subject: [PATCH 148/152] Make datasets consistent with paper: remove loader --- cpmpy/tools/datasets/__init__.py | 15 +++- cpmpy/tools/datasets/core.py | 116 ++++++++++++++----------- cpmpy/tools/datasets/jsplib.py | 17 +--- cpmpy/tools/datasets/metadata.py | 3 +- cpmpy/tools/datasets/miplib.py | 38 +------- cpmpy/tools/datasets/mse.py | 15 ---- 
cpmpy/tools/datasets/nurserostering.py | 29 ++----- cpmpy/tools/datasets/opb.py | 15 ---- cpmpy/tools/datasets/psplib.py | 45 +++++----- cpmpy/tools/datasets/sat.py | 18 +--- cpmpy/tools/datasets/transforms.py | 21 ++--- cpmpy/tools/datasets/xcsp3.py | 16 ---- 12 files changed, 121 insertions(+), 227 deletions(-) diff --git a/cpmpy/tools/datasets/__init__.py b/cpmpy/tools/datasets/__init__.py index 600c78900..943c4f663 100644 --- a/cpmpy/tools/datasets/__init__.py +++ b/cpmpy/tools/datasets/__init__.py @@ -37,6 +37,14 @@ "OPBDataset", "MaxSATEvalDataset", "SATDataset", + "DIMACSCliqueDataset", + # Parse/model helpers for parse-first datasets + "parse_jsp", + "model_jobshop", + "parse_rcpsp", + "model_rcpsp", + "parse_scheduling_period", + "model_nurserostering", # Transforms "Compose", "Open", @@ -49,13 +57,14 @@ "extract_format_metadata", ] from .miplib import MIPLibDataset -from .jsplib import JSPLibDataset -from .psplib import PSPLibDataset -from .nurserostering import NurseRosteringDataset +from .jsplib import JSPLibDataset, parse_jsp, model_jobshop +from .psplib import PSPLibDataset, parse_rcpsp, model_rcpsp +from .nurserostering import NurseRosteringDataset, parse_scheduling_period, model_nurserostering from .xcsp3 import XCSP3Dataset from .opb import OPBDataset from .mse import MaxSATEvalDataset from .sat import SATDataset +from .dimacs_clique import DIMACSCliqueDataset from .transforms import Compose, Open, Load, Serialize, Translate, SaveToFile, Lambda, extract_format_metadata # Backward compatibility alias Parse = Load diff --git a/cpmpy/tools/datasets/core.py b/cpmpy/tools/datasets/core.py index b9a7fcc77..5af317e09 100644 --- a/cpmpy/tools/datasets/core.py +++ b/cpmpy/tools/datasets/core.py @@ -17,7 +17,6 @@ To implement a new dataset, one needs to subclass one of the abstract dataset classes, and provide implementation for the following methods: -- _loader: loads a CPMpy model from a string representation of the instance (file) - category: return a 
dictionary of category labels, describing to which subset the dataset has been restricted (year, track, ...) - download: download the dataset (helper function :func:`_download_file` is provided) @@ -79,7 +78,7 @@ from cpmpy.tools.io import load_wcnf from cpmpy.tools.datasets.metadata import to_croissant - dataset = MyDataset(download=True, transform=load_wcnf(x), target_transform=to_croissant) + dataset = MyDataset(download=True, transform=load_wcnf, target_transform=to_croissant) for model, croissant_record in dataset: ... @@ -88,7 +87,7 @@ Example: .. code-block:: python - dataset = MyDataset(download=True, transform=load_wcnf(x), target_transform=to_croissant) + dataset = MyDataset(download=True, transform=load_wcnf, target_transform=to_croissant) from torch.utils.data import random_split train_dataset, test_dataset = random_split(dataset, [0.8, 0.2]) @@ -596,6 +595,17 @@ class FileDataset(IndexedDataset): Either have a look at one of the concrete subclasses, providing access to well-known datasets from the community, or use this class as the base for your own dataset. + Two dataset styles are supported: + + - Model-defined instances: files directly encode variables/constraints/objective + (for example XCSP3, OPB, DIMACS, FlatZinc). In this case, users typically + pass a loader from ``cpmpy.tools.io`` as ``transform``. + - Data-only instances: files encode problem data for a fixed family, but no + model. In this case, subclasses should override ``parse()`` and users can + enable ``parse=True`` to obtain parsed intermediate data structures + (for example table/dict structures for RCPSP-style scheduling data), then + build a model separately or via a transform. + For a more detailed authoring guide (design patterns, metadata conventions, and implementation checklist), see :ref:`datasets_advanced_authoring`. 
""" @@ -608,6 +618,7 @@ def __init__( dataset_dir: str = ".", transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, + parse: bool = False, extension: str = ".txt", **kwargs ): @@ -619,6 +630,9 @@ def __init__( transform (callable, optional): Optional transform applied to the instance file path. target_transform (callable, optional): Optional transform applied to the metadata dictionary. download (bool): If True, downloads the dataset if it does not exist locally (default=False). + parse (bool): If True, run ``self.parse(instance_path)`` before + applying ``transform``. Intended for data-only datasets that do + not directly encode a CPMpy model in the source file. extension (str): Extension of the instance files. Used to filter instance files from the dataset directory. **kwargs: Advanced options. Currently supports: - metadata_workers (int): Number of parallel workers for @@ -635,6 +649,7 @@ def __init__( self.dataset_dir = pathlib.Path(dataset_dir) self.extension = extension + self._parse = parse # Advanced options metadata_workers = kwargs.pop("metadata_workers", 1) @@ -666,20 +681,6 @@ def _check_exists(self) -> bool: # Methods to implement in subclasses: # # ---------------------------------------------------------------------------- # - @staticmethod - @abstractmethod - def _loader(content: str) -> cp.Model: - """ - Loader for the dataset. Loads a CPMpy model from raw file content string. - The content will be the raw text content of the file (already decompressed). - - Arguments: - content (str): Raw file content string to load into a model. - - Returns: - cp.Model: The loaded CPMpy model. - """ - pass @abstractmethod def categories(self) -> dict: @@ -744,37 +745,30 @@ def read(self, instance: os.PathLike) -> str: with self.open(instance) as f: return f.read() + def parse(self, instance: os.PathLike): + """ + Parse an instance file into intermediate data structures. 
- # ---------------------------------------------------------------------------- # - # Public interface # - # ---------------------------------------------------------------------------- # + Override this for datasets whose files contain problem data but not an + explicit model. Typical outputs are structures like tables, arrays, and + dictionaries that can then be passed to a separate model-construction + function. + + Default behavior is ``read(instance)``, i.e. return raw text content. - def load(self, instance: Union[str, os.PathLike]) -> cp.Model: - """ - Load a CPMpy model from an instance file. - - Uses `.read()` to handle reading (decompressing + reading raw contents) and then turns - raw contents into a CPMpy model via `.loader()`. - Arguments: - instance (str or os.PathLike): - - File path to the instance file - - OR a string containing the instance content directly - + instance (os.PathLike): File path to the instance file. + Returns: - cp.Model: The loaded CPMpy model. + The parsed intermediate data structure(s). """ + return self.read(instance) - # If instance is a path to a file -> open file - if isinstance(instance, (str, os.PathLike)) and os.path.exists(instance): - # Reading - use read() to decompress and read raw file contents - content = self.read(instance) - # If instance is a string containing a model -> use it directly - else: - content = instance - # Loading - turn raw contents into CPMpy model - return self._loader(content) + # ---------------------------------------------------------------------------- # + # Public interface # + # ---------------------------------------------------------------------------- # + def instance_metadata(self, instance: os.PathLike) -> InstanceInfo: """ @@ -855,16 +849,41 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: if self.target_transform: metadata = self.target_transform(metadata) + data = filename + + # Built-in parse stage: parse the instance file into intermediate data structures. 
+ # Mostly meant for datasets where files represent data and modeling is separate. + if self._parse: + data = self.parse(data) + if self.transform: - filename = self.transform(filename) + # TODO revisit this flow of execution + if isinstance(data, (str, os.PathLike)): + # Convenience for io loaders: pass dataset.open when supported. + try: + data = self.transform(data, open=self.open) + except TypeError: + data = self.transform(data) + else: + try: + data = self.transform(data) + except TypeError as exc: + # Convenience for parse-first datasets where parse() returns + # tuples and model builders take positional args. + if isinstance(data, tuple): + data = self.transform(*data) + else: + raise exc # Let transforms contribute to metadata (e.g. model verification info) if hasattr(self.transform, 'enrich_metadata'): - metadata = self.transform.enrich_metadata(filename, metadata) - elif isinstance(filename, cp.Model): - # Transform returned a CPMpy model (e.g. dataset.load); enrich from model - metadata = _enrich_from_model(filename, metadata) + metadata = self.transform.enrich_metadata(data, metadata) + elif isinstance(data, cp.Model): + # Transform returned a CPMpy model; enrich metadata from model details. + metadata = _enrich_from_model(data, metadata) + elif isinstance(data, cp.Model): + metadata = _enrich_from_model(data, metadata) - return filename, metadata + return data, metadata # ---------------------------- Metadata collection --------------------------- # @@ -1208,9 +1227,6 @@ def url(self) -> str: def citation(self) -> List[str]: raise NotImplementedError("Arbitrary file dataset does not support a citation. Please implement this method in a subclass, or use a more specific dataset class.") - def _loader(self, file: os.PathLike) -> cp.Model: - raise NotImplementedError("Arbitrary file dataset does not support loading. 
Please implement this method in a subclass, or use a more specific dataset class.") - def category(self) -> dict: raise NotImplementedError("Arbitrary file dataset does not support categories. Please implement this method in a subclass, or use a more specific dataset class.") diff --git a/cpmpy/tools/datasets/jsplib.py b/cpmpy/tools/datasets/jsplib.py index b8dc529bc..2994582ff 100644 --- a/cpmpy/tools/datasets/jsplib.py +++ b/cpmpy/tools/datasets/jsplib.py @@ -39,11 +39,6 @@ class JSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible "E. Taillard. 'Benchmarks for basic scheduling problems', European Journal of Operational Research, Vol. 64, Issue 2, pp. 278-285, 1993.", ] - version = "1.0.0" - license = "MIT" - domain = "scheduling" - tags = ["optimization", "job-shop-scheduling", "scheduling", "combinatorial"] - language = "JSPLib" features = FeaturesInfo({ "jobs": ("int", "Number of jobs"), "machines": ("int", "Number of machines"), @@ -76,15 +71,11 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl **kwargs ) - @staticmethod - def _loader(content: str): + def parse(self, instance: os.PathLike): """ - Loader for JSPLib dataset. - Loads a CPMpy model from raw JSPLib content string. + Parse a JSPLib instance into task routing and durations. """ - from cpmpy.tools.io.jsplib import load_jsplib - # load_jsplib already supports raw strings - return load_jsplib(content) + return parse_jsp(instance) def category(self) -> dict: return {} # no categories @@ -211,7 +202,7 @@ def parse_jsp(filename: str): return task_to_machines, task_durations -def jobshop_model(task_to_machines, task_durations): +def model_jobshop(task_to_machines, task_durations): """ Create a CPMpy model for the Jobshop problem. 
diff --git a/cpmpy/tools/datasets/metadata.py b/cpmpy/tools/datasets/metadata.py index 82f804cbd..3e3cf8687 100644 --- a/cpmpy/tools/datasets/metadata.py +++ b/cpmpy/tools/datasets/metadata.py @@ -420,7 +420,8 @@ def model_objects(self) -> dict: .. code-block:: python - dataset.transform = Load(dataset.loader, open=dataset.open) + from cpmpy.tools.io import load_jsplib + dataset.transform = Load(load_jsplib, open=dataset.open) model, info = dataset[0] vars = info.model_objects["variables"] diff --git a/cpmpy/tools/datasets/miplib.py b/cpmpy/tools/datasets/miplib.py index 2d9def666..f6300901f 100644 --- a/cpmpy/tools/datasets/miplib.py +++ b/cpmpy/tools/datasets/miplib.py @@ -29,12 +29,7 @@ class MIPLibDataset(FileDataset): # torch.utils.data.Dataset compatible "Gleixner, A., Hendel, G., Gamrath, G., Achterberg, T., Bastubbe, M., Berthold, T., Christophel, P. M., Jarck, K., Koch, T., Linderoth, J., Lubbecke, M., Mittelmann, H. D., Ozyurt, D., Ralphs, T. K., Salvagnin, D., and Shinano, Y. MIPLIB 2017: Data-Driven Compilation of the 6th Mixed-Integer Programming Library. Mathematical Programming Computation, 2021. https://doi.org/10.1007/s12532-020-00194-3.", ] - version = "2017" - license = "CC BY 4.0" - domain = "mip" - tags = ["optimization", "mixed-integer-programming", "mip", "combinatorial"] - language = "MPS" - + def __init__( self, @@ -73,37 +68,6 @@ def __init__( **kwargs ) - @staticmethod - def reader(file_path, open=open): - """ - Reader for MIPLib dataset. - Parses a file path directly into a CPMpy model. - For backward compatibility. Consider using read() + load() instead. - """ - from cpmpy.tools.io.scip import load_scip - return load_scip(file_path, open=open) - - @staticmethod - def _loader(content: str): - """ - Loader for MIPLib dataset. - Loads a CPMpy model from raw MPS/LP content string. - Note: SCIP requires a file, so content is written to a temporary file. 
- """ - import tempfile - import os - from cpmpy.tools.io.scip import load_scip - - # SCIP requires a file path, so write content to temp file - with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.mps') as tmp: - tmp.write(content) - tmp_path = tmp.name - - try: - return load_scip(tmp_path) - finally: - os.unlink(tmp_path) - def category(self) -> dict: return { "year": self.year, diff --git a/cpmpy/tools/datasets/mse.py b/cpmpy/tools/datasets/mse.py index c5e540de6..d6ca80224 100644 --- a/cpmpy/tools/datasets/mse.py +++ b/cpmpy/tools/datasets/mse.py @@ -12,8 +12,6 @@ import pathlib import io -import cpmpy as cp -from cpmpy.tools.io.wcnf import load_wcnf from cpmpy.tools.datasets.core import FileDataset from cpmpy.tools.datasets.metadata import FeaturesInfo @@ -39,11 +37,6 @@ class MaxSATEvalDataset(FileDataset): # torch.utils.data.Dataset compatible homepage = "https://maxsat-evaluations.github.io/" citation = [] - version = "2024" - license = "competition-specific" - domain = "max_sat" - tags = ["optimization", "max-sat", "weighted-max-sat", "wcnf"] - language = "WCNF" features = FeaturesInfo({ "wcnf_num_variables": ("int", "Number of propositional variables"), "wcnf_num_clauses": ("int", "Total number of clauses (hard + soft)"), @@ -102,14 +95,6 @@ def __init__( ) - @staticmethod - def _loader(content: str) -> cp.Model: - """ - Loader for MaxSAT Evaluation dataset. - Loads a CPMpy model from raw WCNF content string. - """ - return load_wcnf(content) - def category(self) -> dict: return { "year": self.year, diff --git a/cpmpy/tools/datasets/nurserostering.py b/cpmpy/tools/datasets/nurserostering.py index e9f84bc78..394ca2790 100644 --- a/cpmpy/tools/datasets/nurserostering.py +++ b/cpmpy/tools/datasets/nurserostering.py @@ -49,11 +49,6 @@ class NurseRosteringDataset(FileDataset): # torch.utils.data.Dataset compatible "Rahimian, E., Akartunali, K., and Levine, J. 
A hybrid integer programming and variable neighbourhood search algorithm to solve nurse rostering problems. European Journal of Operational Research, 2017. 258(2): p. 411-423.", ] - version = "1.0.0" - license = "academic-use" - domain = "scheduling" - tags = ["satisfaction", "nurse-rostering", "scheduling", "timetabling"] - language = "NRP-XML" features = FeaturesInfo({ "horizon": ("int", "Planning horizon in days"), "num_staff": ("int", "Number of nurses / staff members"), @@ -86,25 +81,11 @@ def __init__(self, root: str = ".", transform=None, target_transform=None, downl **kwargs ) - @staticmethod - def reader(file_path, open=open): + def parse(self, instance: os.PathLike): """ - Reader for Nurse Rostering dataset. - Parses a file path directly into a CPMpy model. - For backward compatibility. Consider using read() + load() instead. + Parse a nurse rostering instance into native Python data structures. """ - from cpmpy.tools.io.nurserostering import load_nurserostering - return load_nurserostering(file_path, open=open) - - @staticmethod - def _loader(content: str): - """ - Loader for Nurse Rostering dataset. - Loads a CPMpy model from raw Nurse Rostering content string. - """ - from cpmpy.tools.io.nurserostering import load_nurserostering - # load_nurserostering already supports raw strings - return load_nurserostering(content) + return parse_scheduling_period(instance) def category(self) -> dict: return {} # no categories @@ -416,7 +397,7 @@ def to_dataframes(data): return result -def nurserostering_model(horizon, shifts, staff, days_off, shift_on, shift_off, cover): +def model_nurserostering(horizon, shifts, staff, days_off, shift_on, shift_off, cover): """ Create a CPMpy model for nurserostering. 
@@ -521,7 +502,7 @@ def nurserostering_model(horizon, shifts, staff, days_off, shift_on, shift_off, data, metadata = dataset[0] print(data) - model, nurse_view = nurserostering_model(**data) + model, nurse_view = model_nurserostering(**data) assert model.solve() print(f"Found optimal solution with penalty of {model.objective_value()}") diff --git a/cpmpy/tools/datasets/opb.py b/cpmpy/tools/datasets/opb.py index be83c2d47..d272aac93 100644 --- a/cpmpy/tools/datasets/opb.py +++ b/cpmpy/tools/datasets/opb.py @@ -35,11 +35,6 @@ class OPBDataset(FileDataset): "Berre, D. L., Parrain, A. The Pseudo-Boolean Evaluation 2011. JSAT, 7(1), 2012.", ] - version = "2024" - license = "competition-specific" - domain = "pseudo_boolean" - tags = ["optimization", "pseudo-boolean", "opb", "combinatorial"] - language = "OPB" features = FeaturesInfo({ "author": ("str", "Author extracted from filename convention"), "opb_num_variables": ("int", "Number of Boolean variables (from OPB header)"), @@ -94,16 +89,6 @@ def __init__( **kwargs ) - @staticmethod - def _loader(content: str): - """ - Loader for OPB dataset. - Loads a CPMpy model from raw OPB content string. - """ - from cpmpy.tools.io.opb import load_opb - # load_opb already supports raw strings - return load_opb(content) - def category(self) -> dict: return { "year": self.year, diff --git a/cpmpy/tools/datasets/psplib.py b/cpmpy/tools/datasets/psplib.py index 74574aa7c..cbeef7e38 100644 --- a/cpmpy/tools/datasets/psplib.py +++ b/cpmpy/tools/datasets/psplib.py @@ -6,7 +6,6 @@ import os import pathlib -import io import zipfile from cpmpy.tools.datasets.core import FileDataset @@ -27,11 +26,6 @@ class PSPLibDataset(FileDataset): # torch.utils.data.Dataset compatible "Kolisch, R., Sprecher, A. PSPLIB - A project scheduling problem library. 
European Journal of Operational Research, 96(1), 205-216, 1997.", ] - version = "1.0.0" - license = "academic-use" - domain = "scheduling" - tags = ["optimization", "project-scheduling", "rcpsp", "scheduling", "combinatorial"] - language = "PSPLIB" features = FeaturesInfo({ "num_jobs": ("int", "Number of jobs (activities) in the project"), "horizon": ("int", "Planning horizon (maximum makespan upper bound)"), @@ -86,25 +80,11 @@ def __init__(self, root: str = ".", variant: str = "rcpsp", family: str = "j30", **kwargs ) - @staticmethod - def reader(file_path, open=open): + def parse(self, instance: os.PathLike): """ - Reader for PSPLib dataset. - Parses a file path directly into a CPMpy model. - For backward compatibility. Consider using read() + load() instead. + Parse a PSPLIB RCPSP instance into job data and capacities. """ - from cpmpy.tools.io.rcpsp import load_rcpsp - return load_rcpsp(file_path, open=open) - - @staticmethod - def _loader(content: str): - """ - Loader for PSPLib dataset. - Loads a CPMpy model from raw RCPSP content string. - """ - from cpmpy.tools.io.rcpsp import load_rcpsp - # load_rcpsp already supports raw strings - return load_rcpsp(content) + return parse_rcpsp(instance) def category(self) -> dict: return { @@ -202,4 +182,21 @@ def download(self): if __name__ == "__main__": dataset = PSPLibDataset(variant="rcpsp", family="j30", download=True) print("Dataset size:", len(dataset)) - print("Instance 0:", dataset[0]) \ No newline at end of file + print("Instance 0:", dataset[0]) + + +def parse_rcpsp(filename: str): + """ + Parse an RCPSP instance into tabular task data and resource capacities. + """ + from cpmpy.tools.io.rcpsp import _parse_rcpsp + with open(filename, "r") as f: + return _parse_rcpsp(f) + + +def model_rcpsp(job_data, capacities): + """ + Build a CPMpy RCPSP model from parsed task data and capacities. 
+ """ + from cpmpy.tools.io.rcpsp import _model_rcpsp + return _model_rcpsp(job_data=job_data, capacities=capacities) \ No newline at end of file diff --git a/cpmpy/tools/datasets/sat.py b/cpmpy/tools/datasets/sat.py index 62534159e..61db1cda9 100644 --- a/cpmpy/tools/datasets/sat.py +++ b/cpmpy/tools/datasets/sat.py @@ -10,7 +10,6 @@ import os import pathlib import re -import tempfile from urllib.request import Request, urlopen from cpmpy.tools.datasets.core import FileDataset @@ -29,11 +28,7 @@ class SATDataset(FileDataset): description = "SAT competition benchmark instances (DIMACS CNF) from benchmark-database.de." homepage = "https://benchmark-database.de/" citation = [] - version = "2025" - license = "competition-specific" - domain = "sat" - tags = ["satisfaction", "sat", "cnf", "dimacs"] - language = "DIMACS-CNF" + features = FeaturesInfo({ "dimacs_num_variables": ("int", "Number of propositional variables from DIMACS p-line"), "dimacs_num_clauses": ("int", "Number of clauses from DIMACS p-line"), @@ -75,17 +70,6 @@ def __init__( **kwargs, ) - @staticmethod - def _loader(content: str): - from cpmpy.tools.io.dimacs import load_dimacs - with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".cnf") as tmp: - tmp.write(content) - tmp_path = tmp.name - try: - return load_dimacs(tmp_path) - finally: - os.unlink(tmp_path) - def category(self) -> dict: return {"track": self.track, "context": self.context} diff --git a/cpmpy/tools/datasets/transforms.py b/cpmpy/tools/datasets/transforms.py index 0dd253f77..eca05f38d 100644 --- a/cpmpy/tools/datasets/transforms.py +++ b/cpmpy/tools/datasets/transforms.py @@ -263,8 +263,8 @@ class Load: Arguments: loader (callable): A loader function that takes raw content string and - returns a CPMpy model. Can be a dataset's ``loader`` method or a - loader function that supports raw strings (e.g., ``load_wcnf``, + returns a CPMpy model. 
Usually imported from ``cpmpy.tools.io`` and + supporting raw string input (e.g., ``load_wcnf``, ``load_opb``, ``load_xcsp3``, etc.). open (callable, optional): A callable to open files for reading. Typically ``dataset.open``. Defaults to Python's built-in ``open``. @@ -272,11 +272,7 @@ class Load: Example:: - >>> # Using dataset's loader method - >>> dataset = MSEDataset(transform=Load(dataset.loader, open=dataset.open)) - >>> model, metadata = dataset[0] - - >>> # Using a loader function that supports raw strings + >>> # Using an io loader function >>> from cpmpy.tools.io.wcnf import load_wcnf >>> dataset = MSEDataset(transform=Load(load_wcnf, open=dataset.open)) >>> model, metadata = dataset[0] @@ -376,9 +372,9 @@ class Translate: Arguments: loader (callable): A loader function that takes raw content string and - returns a CPMpy model. Can be a dataset's ``loader`` method or a - loader function that supports raw strings (e.g., ``load_wcnf``, - ``read_opb``, ``read_xcsp3``, etc.). + returns a CPMpy model. Usually imported from ``cpmpy.tools.io`` and + supporting raw string input (e.g., ``load_wcnf``, ``load_opb``, + ``load_xcsp3``, etc.). writer (callable or str): Either a writer function (e.g., ``write_dimacs``, ``write_opb``) or a format name string (e.g., ``"dimacs"``, ``"mps"``) that will be resolved to the appropriate writer function. 
@@ -388,14 +384,15 @@ class Translate: Example:: + >>> from cpmpy.tools.io.wcnf import load_wcnf >>> # Using format name string - >>> transform = Translate(dataset.loader, "dimacs", open=dataset.open) + >>> transform = Translate(load_wcnf, "dimacs", open=dataset.open) >>> dataset = MSEDataset(transform=transform) >>> dimacs_string, metadata = dataset[0] >>> # Using writer function directly >>> from cpmpy.tools.io.dimacs import write_dimacs - >>> transform = Translate(dataset.loader, write_dimacs, open=dataset.open) + >>> transform = Translate(load_wcnf, write_dimacs, open=dataset.open) >>> dataset = MSEDataset(transform=transform) >>> dimacs_string, metadata = dataset[0] >>> metadata['variables'] # from the intermediate model diff --git a/cpmpy/tools/datasets/xcsp3.py b/cpmpy/tools/datasets/xcsp3.py index 5d9a05ce0..d305856ac2 100644 --- a/cpmpy/tools/datasets/xcsp3.py +++ b/cpmpy/tools/datasets/xcsp3.py @@ -10,7 +10,6 @@ import pathlib import io -import cpmpy as cp from cpmpy.tools.datasets.core import FileDataset from cpmpy.tools.datasets.metadata import FeaturesInfo @@ -36,11 +35,6 @@ class XCSP3Dataset(FileDataset): # torch.utils.data.Dataset compatible "Audemard, G., Boussemart, F., Lecoutre, C., Piette, C., Tabary, S. XCSP3: An Integrated Format for Benchmarking Combinatorial Constrained Problems. arXiv:2009.00514, 2020.", ] - version = "2024" - license = "competition-specific" - domain = "constraint_programming" - tags = ["satisfaction", "optimization", "xcsp3", "combinatorial"] - language = "XCSP3" features = FeaturesInfo({ "instance_type": ("str", "Problem type declared in the XML root element (CSP, COP, WCSP, …)"), "xcsp_format": ("str", "XCSP format version string from the XML header"), @@ -71,16 +65,6 @@ def __init__(self, root: str = ".", year: int = 2024, track: str = "CSP", transf ) - @staticmethod - def _loader(content: str) -> cp.Model: - """ - Loader for XCSP3 dataset. - Loads a CPMpy model from raw XCSP3 content string. 
- """ - from cpmpy.tools.xcsp3.parser import load_xcsp3 - # load_xcsp3 already supports raw strings - return load_xcsp3(content) - def category(self) -> dict: return { "year": self.year, From f4dd1ab9cd30c6299385bed1f8d00ec403d1fbbf Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 11 Mar 2026 11:26:41 +0100 Subject: [PATCH 149/152] Add / update docs --- docs/api/tools/benchmark_runner.rst | 251 ++++++++++++++ docs/api/tools/benchmarks.rst | 208 ++++++++++++ docs/api/tools/datasets.rst | 491 ++++++++++++++++++++++++++++ docs/api/tools/dimacs.rst | 4 +- docs/api/tools/readers.rst | 183 +++++++++++ docs/api/tools/writers.rst | 181 ++++++++++ docs/api/tools/xcsp3.rst | 7 - docs/dataset_authoring.md | 185 +++++++++++ docs/datasets.md | 283 ++++++++++++++++ docs/index.rst | 6 + docs/instance_metadata.md | 117 ++++++- docs/reading_and_writing.md | 50 +-- docs/transforms_guide.md | 172 ++++++++++ 13 files changed, 2099 insertions(+), 39 deletions(-) create mode 100644 docs/api/tools/benchmark_runner.rst create mode 100644 docs/api/tools/benchmarks.rst create mode 100644 docs/api/tools/datasets.rst create mode 100644 docs/api/tools/readers.rst create mode 100644 docs/api/tools/writers.rst create mode 100644 docs/dataset_authoring.md create mode 100644 docs/datasets.md create mode 100644 docs/transforms_guide.md diff --git a/docs/api/tools/benchmark_runner.rst b/docs/api/tools/benchmark_runner.rst new file mode 100644 index 000000000..84d33f85e --- /dev/null +++ b/docs/api/tools/benchmark_runner.rst @@ -0,0 +1,251 @@ +Benchmark Runner (:mod:`cpmpy.tools.benchmark.runner`) +===================================================== + +The benchmark runner provides functionality to execute benchmarks across multiple instances +in parallel, with proper resource management, result collection, and CSV output generation. 
+ +Overview +-------- + +The benchmark runner module provides: + +- **Parallel execution**: Run multiple instances concurrently +- **Resource management**: Time and memory limits per instance +- **Result collection**: Structured CSV output with instance metadata +- **Progress tracking**: Progress bars and status reporting +- **Error isolation**: Each instance runs in isolation to prevent crashes + +Basic Usage +----------- + +The simplest way to run a benchmark across a dataset: + +.. code-block:: python + + from cpmpy.tools.benchmark.runner import benchmark_runner + from cpmpy.tools.benchmark.opb import OPBBenchmark + from cpmpy.tools.datasets import OPBDataset + + # Load dataset + dataset = OPBDataset(root=".", year=2023, download=True) + + # Create benchmark instance + benchmark = OPBBenchmark() + + # Run benchmark across all instances + output_file = benchmark_runner( + dataset=dataset, + instance_runner=benchmark, + output_file="results.csv", + solver="ortools", + workers=4, + time_limit=300, + mem_limit=4096 + ) + +Function Signature +------------------ + +.. code-block:: python + + benchmark_runner( + dataset, # Dataset object + instance_runner, # Benchmark instance + output_file, # Output CSV file path + solver, # Solver name + workers=1, # Number of parallel workers + time_limit=300, # Time limit per instance (seconds) + mem_limit=4096, # Memory limit per instance (MiB) + cores=1, # CPU cores per instance + verbose=False, # Show solver output + intermediate=False, # Report intermediate solutions + checker_path=None, # Path to solution checker + **kwargs # Additional arguments + ) -> str # Returns output file path + +Parameters +---------- + +dataset +~~~~~~~ + +A dataset object (e.g., :class:`XCSP3Dataset`, :class:`OPBDataset`) that provides instances to benchmark. + +instance_runner +~~~~~~~~~~~~~~~ + +A benchmark instance (e.g., :class:`XCSP3Benchmark`, :class:`OPBBenchmark`) that implements the `run()` method. 
+
+output_file
+~~~~~~~~~~~
+
+Path to the CSV file where results will be written. The file will contain columns for:
+- Instance metadata (name, path, category)
+- Solver status
+- Runtime
+- Memory usage
+- Objective value (if applicable)
+- Other benchmark-specific fields
+
+solver
+~~~~~~
+
+Name of the solver to use (e.g., "ortools", "gurobi", "z3").
+
+workers
+~~~~~~~
+
+Number of parallel workers used to run instances. Default is 1 (sequential execution).
+
+time_limit
+~~~~~~~~~~
+
+Time limit in seconds for each instance. Default is 300 (5 minutes).
+
+mem_limit
+~~~~~~~~~
+
+Memory limit in MiB (1024 * 1024 bytes) per instance. Default is 4096 (4 GB).
+
+cores
+~~~~~
+
+Number of CPU cores assigned per instance. Default is 1.
+
+verbose
+~~~~~~~
+
+Whether to show solver output in stdout. Default is False.
+
+intermediate
+~~~~~~~~~~~~
+
+Whether to report intermediate solutions if supported. Default is False.
+
+checker_path
+~~~~~~~~~~~~
+
+Optional path to a solution checker executable for validating instance solutions.
+
+Example: Running XCSP3 Benchmark
+---------------------------------
+
+.. code-block:: python
+
+    from cpmpy.tools.benchmark.runner import benchmark_runner
+    from cpmpy.tools.benchmark.xcsp3 import XCSP3Benchmark
+    from cpmpy.tools.datasets import XCSP3Dataset
+
+    # Load XCSP3 2024 CSP track dataset
+    dataset = XCSP3Dataset(root=".", year=2024, track="CSP", download=True)
+
+    # Create benchmark
+    benchmark = XCSP3Benchmark()
+
+    # Run with 4 parallel workers
+    output_file = benchmark_runner(
+        dataset=dataset,
+        instance_runner=benchmark,
+        output_file="xcsp3_2024_csp_results.csv",
+        solver="ortools",
+        workers=4,
+        time_limit=600,  # 10 minutes per instance
+        mem_limit=8192,  # 8 GB per instance
+        cores=1,
+        verbose=False,
+        intermediate=False
+    )
+
+    print(f"Results written to: {output_file}")
+
+Example: Running OPB Benchmark
+------------------------------
+
+.. code-block:: python
+
+    from cpmpy.tools.benchmark.runner import benchmark_runner
+    from cpmpy.tools.benchmark.opb import OPBBenchmark
+    from cpmpy.tools.datasets import OPBDataset
+
+    # Load OPB 2023 dataset
+    dataset = OPBDataset(root=".", year=2023, download=True)
+
+    # Create benchmark
+    benchmark = OPBBenchmark()
+
+    # Run benchmark
+    output_file = benchmark_runner(
+        dataset=dataset,
+        instance_runner=benchmark,
+        output_file="opb_2023_results.csv",
+        solver="ortools",
+        workers=8,
+        time_limit=300,
+        mem_limit=4096
+    )
+
+Parallel Execution
+------------------
+
+The benchmark runner uses Python's ThreadPoolExecutor for parallel execution:
+
+- Each instance runs in a separate thread
+- Instances are isolated from each other
+- Results are collected as they complete
+- Progress is tracked with a progress bar (if tqdm is available)
+
+Resource Management
+--------------------
+
+Each instance execution:
+
+- Runs in isolation with its own resource limits
+- Has time and memory limits enforced
+- Captures stdout/stderr separately
+- Handles timeouts gracefully
+
+Output Format
+-------------
+
+The CSV output file contains columns such as:
+
+- **instance_name**: Name of the instance
+- **instance_path**: Path to the instance file
+- **solver**: Solver used
+- **status**: Exit status (optimal, sat, unsat, unknown, etc.)
+- **runtime**: Runtime in seconds
+- **memory**: Peak memory usage in MiB
+- **objective**: Objective value (if applicable)
+- **timeout**: Whether instance timed out
+- **error**: Error message (if any)
+
+Additional columns may be present depending on the dataset metadata.
+ +Error Handling +-------------- + +The benchmark runner handles errors gracefully: + +- Failed instances don't stop the benchmark +- Errors are logged in the CSV output +- Timeouts are handled separately from crashes +- Memory errors are caught and reported + +Progress Tracking +----------------- + +If `tqdm` is available, the benchmark runner shows: + +- Progress bar with instance count +- Estimated time remaining +- Current instance being processed + +Without `tqdm`, progress is printed to stdout. + +API Reference +------------- + +.. automodule:: cpmpy.tools.benchmark.runner + :members: + :undoc-members: + :inherited-members: diff --git a/docs/api/tools/benchmarks.rst b/docs/api/tools/benchmarks.rst new file mode 100644 index 000000000..a8c883366 --- /dev/null +++ b/docs/api/tools/benchmarks.rst @@ -0,0 +1,208 @@ +Benchmarks (:mod:`cpmpy.tools.benchmark`) +===================================================== + +CPMpy provides a comprehensive benchmarking framework for running constraint programming benchmarks +across multiple instances and solvers. The benchmark module allows you to systematically evaluate +solver performance with proper resource management, error handling, and result collection. + +Overview +-------- + +The benchmark module provides: + +- **Benchmark base class**: Framework for running individual instances +- **Dataset-specific benchmarks**: Pre-configured benchmarks for XCSP3, OPB, MSE, JSPLib, PSPLib, etc. +- **Resource management**: Time and memory limits with proper cleanup +- **Solver configuration**: Automatic solver parameter configuration +- **Result tracking**: Structured output and intermediate solution reporting + +Basic Usage +----------- + +The simplest way to run a benchmark: + +.. 
code-block:: python + + from cpmpy.tools.benchmark import Benchmark + from cpmpy.tools.io.opb import read_opb + + # Create a benchmark with a reader + bm = Benchmark(reader=read_opb) + + # Run a single instance + bm.run( + instance="instance.opb", + solver="ortools", + time_limit=30, + mem_limit=1024, + verbose=True + ) + +Available Benchmarks +-------------------- + +CPMpy provides pre-configured benchmarks for various datasets: + +.. list-table:: + :header-rows: 1 + + * - **Benchmark Class** + - **Dataset** + - **Reader** + - **Description** + * - :class:`XCSP3Benchmark ` + - XCSP3Dataset + - read_xcsp3 + - Benchmark for XCSP3 Competition instances + * - :class:`OPBBenchmark ` + - OPBDataset + - read_opb + - Benchmark for Pseudo-Boolean Competition instances + * - :class:`MSEBenchmark ` + - MaxSATEvalDataset + - read_wcnf + - Benchmark for MaxSAT Evaluation instances + * - :class:`JSPLibBenchmark ` + - JSPLibDataset + - read_jsplib + - Benchmark for Job Shop Scheduling instances + * - :class:`PSPLibBenchmark ` + - PSPLibDataset + - read_rcpsp + - Benchmark for Project Scheduling instances + * - :class:`NurseRosteringBenchmark ` + - NurseRosteringDataset + - read_nurserostering + - Benchmark for Nurse Rostering instances + +Using Pre-configured Benchmarks +-------------------------------- + +Example with XCSP3 benchmark: + +.. code-block:: python + + from cpmpy.tools.benchmark.xcsp3 import XCSP3Benchmark + + bm = XCSP3Benchmark() + + bm.run( + instance="instance.xml", + solver="ortools", + time_limit=60, + mem_limit=2048, + cores=4, + verbose=True + ) + +Resource Limits +--------------- + +Benchmarks support both time and memory limits: + +.. 
code-block:: python + + bm = Benchmark(reader=read_opb) + + bm.run( + instance="instance.opb", + solver="ortools", + time_limit=300, # 5 minutes in seconds + mem_limit=4096, # 4 GB in MiB + cores=1 # Number of CPU cores + ) + +Solver Configuration +--------------------- + +The benchmark framework automatically configures solver parameters. You can customize this: + +.. code-block:: python + + class CustomBenchmark(Benchmark): + def ortools_arguments(self, model, cores=None, seed=None, **kwargs): + res = super().ortools_arguments(model, cores=cores, seed=seed, **kwargs) + # Add custom OR-Tools parameters + res[0]["use_rins_lns"] = True + return res + +Intermediate Solutions +---------------------- + +For optimization problems, you can enable intermediate solution reporting: + +.. code-block:: python + + bm = Benchmark(reader=read_opb) + + bm.run( + instance="instance.opb", + solver="ortools", + time_limit=300, + intermediate=True # Report intermediate solutions + ) + +This will print intermediate objective values as they are found. + +Exit Status +----------- + +Benchmarks return exit statuses indicating the result: + +.. code-block:: python + + from cpmpy.tools.benchmark import ExitStatus + + # ExitStatus.optimal: Optimal solution found (COP) + # ExitStatus.sat: Solution found but not proven optimal (CSP/COP) + # ExitStatus.unsat: Instance is unsatisfiable + # ExitStatus.unsupported: Instance contains unsupported features + # ExitStatus.unknown: Any other case + +Custom Benchmarks +----------------- + +Create custom benchmarks by inheriting from the Benchmark base class: + +.. 
code-block:: python + + from cpmpy.tools.benchmark import Benchmark + from cpmpy.tools.io.opb import read_opb + + class MyBenchmark(Benchmark): + def print_result(self, s): + # Custom result printing + print(f"Custom result: {s.status()}") + + def handle_exception(self, e): + # Custom error handling + print(f"Custom error: {e}") + super().handle_exception(e) + +Error Handling +-------------- + +The benchmark framework handles various error conditions: + +- **MemoryError**: When memory limit is exceeded +- **TimeoutError**: When time limit is exceeded +- **NotImplementedError**: When instance contains unsupported features +- **Other exceptions**: General error handling with stack traces + +All errors are properly handled and reported through callback methods. + +Signal Handling +--------------- + +Benchmarks properly handle system signals: + +- **SIGTERM/SIGINT**: Graceful termination +- **SIGXCPU**: CPU time limit exceeded (Unix only) + +API Reference +------------- + +.. automodule:: cpmpy.tools.benchmark._base + :members: + :undoc-members: + :inherited-members: diff --git a/docs/api/tools/datasets.rst b/docs/api/tools/datasets.rst new file mode 100644 index 000000000..b003f26ea --- /dev/null +++ b/docs/api/tools/datasets.rst @@ -0,0 +1,491 @@ +Datasets (:mod:`cpmpy.tools.datasets`) +======================================= + +CPMpy provides a PyTorch-style dataset interface for loading and iterating over +benchmark instance collections. Each dataset handles downloading, file discovery, +metadata collection, and decompression automatically. 
+ +For worked, narrative guides (Markdown), see: + +- :doc:`/datasets` (quickstart + common pipelines) +- :doc:`/instance_metadata` (the metadata system) +- :doc:`/transforms_guide` (transform pipelines, enrichment, analytics) +- :doc:`/dataset_authoring` (implementing new datasets/loaders/writers) +- :doc:`/benchmarking_workflows` (dataset-driven experiments) +- :doc:`/reading_and_writing` (IO loaders/writers + translation workflows) + +Basic Usage +----------- + +Create a dataset and iterate over ``(file_path, info)`` pairs: + +.. code-block:: python + + from cpmpy.tools.datasets import JSPLibDataset + + dataset = JSPLibDataset(root="./data", download=True) + + for file_path, info in dataset: + print(info["name"], info["jobs"], "×", info["machines"]) + +The second element ``info`` is an :class:`InstanceInfo +` — a dict subclass with additional +structured properties. + +Available Datasets +------------------ + +.. list-table:: + :header-rows: 1 + + * - **Class** + - **Domain** + - **Format** + * - :class:`XCSP3Dataset ` + - CP / COP + - XCSP3 + * - :class:`OPBDataset ` + - Pseudo-Boolean + - OPB + * - :class:`MaxSATEvalDataset ` + - MaxSAT + - WCNF + * - :class:`JSPLibDataset ` + - Job Shop Scheduling + - JSPLib + * - :class:`PSPLibDataset ` + - Project Scheduling + - PSPLib + * - :class:`NurseRosteringDataset ` + - Nurse Rostering + - NRP + * - :class:`MIPLibDataset ` + - Mixed Integer Programming + - MPS + * - :class:`SATDataset ` + - SAT + - DIMACS (CNF) + +Loading into CPMpy Models +-------------------------- + +Use an IO loader as the dataset ``transform`` (or wrap it with ``Load``): + +.. 
code-block:: python + + from cpmpy.tools.datasets import XCSP3Dataset + from cpmpy.tools.datasets.transforms import Load + from cpmpy.tools.io import load_xcsp3 + + dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) + dataset.transform = Load(load_xcsp3, open=dataset.open) + + for model, info in dataset: + model.solve() + vars = info.model_objects["variables"] + print({name: v.value() for name, v in vars.items()}) + +Reading and writing with compression +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Datasets use :meth:`~cpmpy.tools.datasets.core.FileDataset.open` for reading +(and thus decompression): e.g. ``dataset.open(path)`` returns a text stream +for ``.xml.lzma`` or ``.opb.xz``. Writers mirror this with an ``open`` +parameter: pass a callable that accepts ``(path, mode="w")`` and returns a +text stream. Use any Python code you like (for example ``lzma.open`` or +``gzip.open``) to write compressed output: + +.. code-block:: python + + import lzma + from cpmpy.tools.io.dimacs import write_dimacs + from cpmpy.tools.io import write_opb + + xz_text = lambda path, mode="w": lzma.open(path, "wt") + write_opb(model, "out.opb.xz", open=xz_text) + write_dimacs(model, "out.cnf.xz", open=xz_text) + +Loaders (e.g. :func:`cpmpy.tools.io.dimacs.load_dimacs`, :func:`cpmpy.tools.io.opb.load_opb`) +also accept an ``open`` parameter for decompression on read. + +Transforms +---------- + +Transforms are applied to the file path on each iteration. Set them on the +``transform`` attribute or pass them to the constructor. + +.. 
list-table:: + :header-rows: 1 + + * - **Class** + - **Description** + * - :class:`Open ` + - Read raw file contents (handles decompression) + * - :class:`Load ` + - Load file into a CPMpy model; enriches metadata with model statistics and ``variables`` + * - :class:`Serialize ` + - Serialize a CPMpy model to a format string + * - :class:`Translate ` + - Load then serialize in one step (format translation) + * - :class:`SaveToFile ` + - Write output to disk; optionally writes ``.meta.json`` sidecars + * - :class:`Lambda ` + - Wrap any callable as a named transform + * - :class:`Compose ` + - Chain multiple transforms sequentially + +.. code-block:: python + + from cpmpy.tools.datasets.transforms import Compose, Translate, SaveToFile + from cpmpy.tools.io import load_xcsp3 + + dataset.transform = Compose([ + Translate(load_xcsp3, "opb", open=dataset.open), + SaveToFile("./translated/", extension=".opb", write_metadata=True), + ]) + + for output_path, info in dataset: + print("Saved:", output_path) + +Instance Metadata (``InstanceInfo``) +------------------------------------- + +``InstanceInfo`` is a ``dict`` subclass — all existing dict access is unchanged. +Structured properties partition the flat dict into four named groups: + +.. list-table:: + :header-rows: 1 + + * - **Property** + - **Contents** + - **Serializable** + * - ``domain_metadata`` + - Problem-level fields: ``jobs``, ``machines``, ``horizon``, … + - ✅ + * - ``format_metadata`` + - Format-specific fields: ``opb_*``, ``wcnf_*``, ``mps_*``, … + - ✅ + * - ``model_features`` + - CP model statistics: variable counts, constraint counts, objective info + - ✅ + * - ``model_objects`` + - Live CPMpy objects: ``variables`` map — in-memory only after ``Load`` + - ❌ + +.. 
code-block:: python + + file, info = dataset[0] + + # Dict access (unchanged) + info["name"] + info.get("jobs", 0) + + # Structured properties + info.id # "jsplib/abz5" + info.domain_metadata # {"jobs": 10, "machines": 10, …} + info.model_features # {"num_variables": …} — after Load + info.model_objects # {"variables": {…}} — after Load + +Metadata Enrichment +-------------------- + +``InstanceInfo`` supports two common enrichment patterns. The most frequent +case is simply adding computed fields. Use the ``|`` operator to merge any +dict into the metadata — everything already in ``info`` is preserved, and +the result is still an ``InstanceInfo`` so all structured properties keep +working: + +.. code-block:: python + + for file_path, info in dataset: + enriched = info | {"density": info["jobs"] / info["machines"]} + print(enriched.domain_metadata) # includes the new "density" field + +To have enrichment happen automatically on every iteration without touching +the loop, pass a ``target_transform``. It receives each ``InstanceInfo`` +and its return value replaces the metadata for that item: + +.. code-block:: python + + dataset = JSPLibDataset( + root="./data", + target_transform=lambda info: info | { + "density": info["jobs"] / info["machines"], + "has_optimum": info.get("optimum") is not None, + }, + ) + + for file_path, info in dataset: + print(info["density"]) # already computed, no extra code needed + +The second pattern arises when a transform changes the file format — for +example, translating a WCNF instance to OPB. The old format-specific fields +(``wcnf_*``) are now stale and should be dropped, while new ones (``opb_*``) +should be added. ``without_format()`` strips all format-prefixed fields and +carries everything else forward; chain it with ``|`` to attach the new ones: + +.. 
code-block:: python + + from cpmpy.tools.datasets.transforms import extract_format_metadata + + # "jobs" and other domain fields survive; wcnf_* are removed; opb_* are added + new_info = info.without_format() | extract_format_metadata(opb_string, "opb") + +For a full explanation with more examples and use cases, see +:doc:`/instance_metadata`. + +.. _datasets_advanced_metadata: + +Advanced Metadata System (Placeholder) +-------------------------------------- + +This section is intentionally reserved for an in-depth guide on the metadata +system used by ``InstanceInfo``, ``DatasetInfo``, ``FeaturesInfo``, and +``FieldInfo``. + +Planned content includes: + +- detailed metadata lifecycle (collection, sidecar storage, loading, enrichment) +- domain vs format-specific vs model-feature metadata boundaries +- schema design guidelines with ``FeaturesInfo`` and ``FieldInfo`` +- dtype normalisation (canonical strings, Python types, schema.org types) +- JSON serialisation contracts (``to_dict``, ``to_jsonable``, ``to_json``) +- export mappings and constraints for Croissant and dataset cards +- recommendations for robust metadata validation and versioning + +Until this section is expanded, use the ``Instance Metadata`` and +``Dataset-Level Metadata`` sections in this page, and the +``cpmpy.tools.datasets.metadata`` API reference below. + +Dataset-Level Metadata +----------------------- + +Every dataset class exposes a :class:`DatasetInfo +` with name, version, license, tags, +citation, and the instance field schema: + +.. 
code-block:: python + + info = JSPLibDataset.dataset_metadata() # classmethod — no download needed + + info.name # "jsplib" + info.version # "1.0.0" + info.license # "MIT" + info.features # FeaturesInfo with field schema + + # HuggingFace-style dataset card + print(JSPLibDataset.card()) + + # MLCommons Croissant JSON-LD + import json + print(json.dumps(JSPLibDataset.dataset_metadata().to_croissant(), indent=2)) + +Creating a Custom Dataset +-------------------------- + +For a complete authoring guide (design patterns, metadata conventions, and +implementation checklist), see :doc:`/dataset_authoring`. + +Subclass :class:`FileDataset ` and +implement the required abstract methods. A **minimal** dataset needs only the +class-level name/description/homepage attributes and three methods: + +.. code-block:: python + + from cpmpy.tools.datasets import FileDataset + + + class MinimalDataset(FileDataset): + + name = "minimal" + description = "Minimal example dataset." + homepage = "https://example.com/minimal" + + def parse(self, instance): + """Optional parse-first hook.""" + return self.read(instance) + + def category(self) -> dict: + return {} # or {"year": ..., "track": ...} + + def download(self): + ... # download files to self.dataset_dir + +An **enriched** dataset adds optional metadata fields and a custom ``__init__`` +to control the dataset directory and extension: + +.. code-block:: python + + import pathlib + from cpmpy.tools.datasets import FileDataset + from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo + + + class MyDataset(FileDataset): + + name = "mydataset" + description = "A short description." + homepage = "https://example.com/mydataset" + citation = ["Author et al. My Dataset. 
Journal, 2024."] + + version = "1.0.0" + license = "CC BY 4.0" + domain = "constraint_programming" + tags = ["combinatorial", "satisfaction"] + language = "MyFormat" + release_notes = {"1.0.0": "Initial release."} + + features = FeaturesInfo({ + "num_vars": ("int", "Number of decision variables"), + "optimum": FieldInfo("int", "Known optimal value", nullable=True), + }) + + def __init__(self, root=".", transform=None, target_transform=None, + download=False): + super().__init__( + dataset_dir=pathlib.Path(root) / self.name, + transform=transform, target_transform=target_transform, + download=download, extension=".txt", + ) + + def parse(self, instance): + """Optional parse-first hook.""" + return self.read(instance) + + def category(self) -> dict: + return {} + + def download(self): + ... # download files to self.dataset_dir + + def collect_instance_metadata(self, file) -> dict: + return {"num_vars": ...} + +Field Type Normalisation (``FieldInfo.dtype``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``FieldInfo`` accepts canonical dtype strings, schema.org dtype strings, or +Python types. +Internally, values are normalised to canonical strings so metadata schemas are +stable and JSON-serialisable. These canonical dtypes are also mapped to +schema.org ``dataType`` values for Croissant export. + +Unknown dtype strings or unsupported Python types raise an exception. + +.. list-table:: + :header-rows: 1 + + * - Canonical dtype + - Accepted schema.org dtype + - Accepted Python type + - Croissant/schema.org ``dataType`` + * - ``"int"`` + - ``"sc:Integer"`` + - ``int`` + - ``sc:Integer`` + * - ``"float"`` + - ``"sc:Float"`` + - ``float`` + - ``sc:Float`` + * - ``"str"`` + - ``"sc:Text"`` + - ``str`` + - ``sc:Text`` + * - ``"bool"`` + - ``"sc:Boolean"`` + - ``bool`` + - ``sc:Boolean`` + * - ``"dict"`` + - ``"sc:StructuredValue"`` + - ``dict`` + - ``sc:StructuredValue`` + * - ``"list"`` + - ``"sc:ItemList"`` + - ``list`` + - ``sc:ItemList`` + +.. 
code-block:: python + + features = FeaturesInfo({ + "jobs": int, # normalised to "int" + "deadline": "sc:Integer", # schema.org string also accepted + "machines": ("int", "Number of machines"), # canonical string form + "optimum": FieldInfo(float, "Best value", nullable=True), + }) + +To extend an existing dataset, subclass it and declare only the new fields — +the framework merges parent and child schemas automatically: + +.. code-block:: python + + class DifficultyJSP(JSPLibDataset): + features = FeaturesInfo({ + "difficulty": FieldInfo("float", "Makespan / num_jobs ratio", nullable=True), + }) + + def collect_instance_metadata(self, file) -> dict: + meta = super().collect_instance_metadata(file) + jobs = meta.get("jobs", 1) + bound = meta.get("optimum") or meta.get("bounds", {}).get("upper") + if bound and jobs: + meta["difficulty"] = round(bound / jobs, 3) + return meta + +Writing a Custom Transform +--------------------------- + +Implement ``__call__`` for the data transformation and optionally +``enrich_metadata`` to update instance metadata based on the output: + +.. code-block:: python + + class MyTransform: + + def __call__(self, file_path): + """Transform the data. Return the new data value.""" + ... + + def enrich_metadata(self, data, metadata): + """ + Update metadata based on __call__'s output. + Called automatically by the dataset after __call__. + Returns an updated InstanceInfo. + """ + return metadata | {"my_field": compute(data)} + +For format-changing transforms use ``without_format()`` to drop old format fields: + +.. code-block:: python + + def enrich_metadata(self, data, metadata): + return metadata.without_format() | extract_format_metadata(data, "opb") + +.. 
_datasets_advanced_authoring: + +Advanced Dataset Authoring (Placeholder) +---------------------------------------- + +This placeholder section has been superseded by the Markdown guides: + +- :doc:`/dataset_authoring` +- :doc:`/transforms_guide` +- :doc:`/benchmarking_workflows` + +API Reference +------------- + +.. automodule:: cpmpy.tools.datasets + :members: + :undoc-members: + +.. automodule:: cpmpy.tools.datasets.metadata + :members: + :undoc-members: + +.. automodule:: cpmpy.tools.datasets.transforms + :members: + :undoc-members: + +.. automodule:: cpmpy.tools.datasets.core + :members: + :undoc-members: diff --git a/docs/api/tools/dimacs.rst b/docs/api/tools/dimacs.rst index 8b785778c..189a279a4 100644 --- a/docs/api/tools/dimacs.rst +++ b/docs/api/tools/dimacs.rst @@ -1,7 +1,7 @@ -DIMACS (:mod:`cpmpy.tools.dimacs`) +DIMACS (:mod:`cpmpy.tools.io.dimacs`) ===================================================== -.. automodule:: cpmpy.tools.dimacs +.. automodule:: cpmpy.tools.io.dimacs :members: :undoc-members: :inherited-members: \ No newline at end of file diff --git a/docs/api/tools/readers.rst b/docs/api/tools/readers.rst new file mode 100644 index 000000000..8287feb51 --- /dev/null +++ b/docs/api/tools/readers.rst @@ -0,0 +1,183 @@ +Loaders (:mod:`cpmpy.tools.io`) +=============================== + +CPMpy provides loaders for various constraint programming and optimization file +formats. All loaders accept either a file path or a raw content string, and +return a :class:`cpmpy.Model` ready to solve. + +Basic Usage +----------- + +A unified ``load()`` function auto-detects the format from the file extension: + +.. code-block:: python + + from cpmpy.tools.io import load + + model = load("instance.opb") + model = load("instance.cnf") + model = load("problem.mps") + + # Explicit format when the extension is ambiguous + model = load("instance.txt", format="opb") + + model.solve() + +Supported Formats +----------------- + +.. 
list-table:: + :header-rows: 1 + + * - **Format** + - **Extension** + - **Loader function** + - **Dependencies** + * - OPB + - ``.opb`` + - :func:`load_opb ` + - — + * - WCNF + - ``.wcnf`` + - :func:`load_wcnf ` + - — + * - DIMACS + - ``.cnf`` + - :func:`load_dimacs ` + - — + * - MPS + - ``.mps`` + - :func:`load_scip ` + - pyscipopt + * - LP + - ``.lp`` + - :func:`load_scip ` + - pyscipopt + * - CIP + - ``.cip`` + - :func:`load_scip ` + - pyscipopt + * - FZN + - ``.fzn`` + - :func:`load_scip ` + - pyscipopt + * - GMS + - ``.gms`` + - :func:`load_scip ` + - pyscipopt + * - PIP + - ``.pip`` + - :func:`load_scip ` + - pyscipopt + * - XCSP3 + - ``.xml`` + - :func:`load_xcsp3 ` + - — + * - JSPLib + - (none) + - :func:`load_jsplib ` + - — + * - PSPLib (RCPSP) + - ``.sm`` + - :func:`load_rcpsp ` + - — + * - Nurse Rostering + - ``.txt`` + - :func:`load_nurserostering ` + - — + +Format-Specific Loaders +----------------------- + +All format-specific loaders accept a file path *or* a raw content string. +This makes them usable both for on-disk files and for programmatically generated +or in-memory content. + +.. code-block:: python + + # Load from file + from cpmpy.tools.io.opb import load_opb + model = load_opb("instance.opb") + + # Load from raw string + content = "* #variable= 2 #constraint= 1\nx1 + x2 >= 1 ;" + model = load_opb(content) + +.. code-block:: python + + from cpmpy.tools.io.wcnf import load_wcnf + model = load_wcnf("instance.wcnf") + +.. code-block:: python + + from cpmpy.tools.io.dimacs import load_dimacs + model = load_dimacs("instance.cnf") + +.. code-block:: python + + import lzma + from cpmpy.tools.io.xcsp3 import load_xcsp3 + model = load_xcsp3("instance.xml.lzma", open=lzma.open) + +.. code-block:: python + + # MPS / LP / CIP / FZN / GMS / PIP (require pyscipopt) + from cpmpy.tools.io.scip import load_scip + model = load_scip("instance.mps") + model = load_scip("instance.lp") + model = load_scip("instance.fzn") + +.. 
code-block:: python + + from cpmpy.tools.io.jsplib import load_jsplib + model = load_jsplib("instance") # Job Shop Scheduling + +.. code-block:: python + + from cpmpy.tools.io.rcpsp import load_rcpsp + model = load_rcpsp("instance.sm") # Resource-Constrained Project Scheduling + +.. code-block:: python + + from cpmpy.tools.io.nurserostering import load_nurserostering + model = load_nurserostering("instance.txt") + +Compressed Files +---------------- + +All loaders accept a custom ``open`` callable for transparent decompression: + +.. code-block:: python + + import lzma + from cpmpy.tools.io.opb import load_opb + + model = load_opb("instance.opb.xz", open=lzma.open) + +The same pattern applies to other loaders. For example, DIMACS CNF: + +.. code-block:: python + + import lzma + from cpmpy.tools.io.dimacs import load_dimacs + + model = load_dimacs("instance.cnf.xz", open=lambda p, mode="r": lzma.open(p, "rt")) + +Datasets handle this automatically via ``dataset.open``. See +:doc:`datasets` and :doc:`/reading_and_writing` for details. + +Listing Available Formats +-------------------------- + +.. code-block:: python + + from cpmpy.tools.io import read_formats + print(read_formats()) + # ['mps', 'lp', 'cip', 'fzn', 'gms', 'pip', 'dimacs', 'opb', 'wcnf'] + +API Reference +------------- + +.. automodule:: cpmpy.tools.io.reader + :members: + :undoc-members: diff --git a/docs/api/tools/writers.rst b/docs/api/tools/writers.rst new file mode 100644 index 000000000..62d7f0071 --- /dev/null +++ b/docs/api/tools/writers.rst @@ -0,0 +1,181 @@ +Writers (:mod:`cpmpy.tools.io`) +================================ + +CPMpy can serialize models to various file formats for use with external solvers +or for format translation. All writers accept an optional ``file_path``; omitting +it (or passing ``None``) returns the result as a string. + +Basic Usage +----------- + +A unified ``write()`` function auto-detects the format from the file extension: + +.. 
code-block:: python + + import cpmpy as cp + from cpmpy.tools.io import write + + x = cp.intvar(0, 10, name="x") + y = cp.intvar(0, 10, name="y") + model = cp.Model([x + y <= 5], minimize=x + y) + + write(model, "output.opb") # format from extension + write(model, "output.mps") + write(model, "output.cnf") + + # Explicit format + write(model, "output.txt", format="opb") + + # Write to string (no file) + opb_string = write(model, format="opb") + +Supported Formats +----------------- + +.. list-table:: + :header-rows: 1 + + * - **Format** + - **Extension** + - **Writer function** + - **Dependencies** + * - OPB + - ``.opb`` + - :func:`write_opb ` + - — + * - DIMACS + - ``.cnf`` + - :func:`write_dimacs ` + - — + * - MPS + - ``.mps`` + - :func:`write_scip ` + - pyscipopt + * - LP + - ``.lp`` + - :func:`write_scip ` + - pyscipopt + * - CIP + - ``.cip`` + - :func:`write_scip ` + - pyscipopt + * - FZN + - ``.fzn`` + - :func:`write_scip ` + - pyscipopt + * - GMS + - ``.gms`` + - :func:`write_scip ` + - pyscipopt + * - PIP + - ``.pip`` + - :func:`write_scip ` + - pyscipopt + +Format-Specific Writers +----------------------- + +All writers return the serialized string when no ``file_path`` (or ``fname``) is +given, making them suitable for use inside dataset transforms. + +.. code-block:: python + + from cpmpy.tools.io.opb import write_opb + + write_opb(model, "output.opb") # write to file + opb_string = write_opb(model) # return as string + +.. code-block:: python + + from cpmpy.tools.io.dimacs import write_dimacs + + write_dimacs(model, "output.cnf") + cnf_string = write_dimacs(model) + +Compressed output +----------------- + +Writers mirror the loader convention: many format-specific writers accept an +optional ``open`` callable. This allows you to write compressed output (or use +any custom I/O) without CPMpy guessing what compression you want. + +.. 
code-block:: python + + import lzma + from cpmpy.tools.io.opb import write_opb + + xz_text = lambda path, mode="w": lzma.open(path, "wt") + write_opb(model, "output.opb.xz", open=xz_text) + +.. code-block:: python + + import lzma + from cpmpy.tools.io.dimacs import write_dimacs + + xz_text = lambda path, mode="w": lzma.open(path, "wt") + write_dimacs(model, "output.cnf.xz", open=xz_text) + +.. code-block:: python + + # MPS / LP / CIP / FZN / GMS / PIP (require pyscipopt) + from cpmpy.tools.io.scip import write_scip + + write_scip(model, "output.mps", format="mps") + write_scip(model, "output.fzn", format="fzn") + mps_string = write_scip(model, format="mps") # return as string + + # Compressed output via open= + import lzma + xz_text = lambda path, mode="w": lzma.open(path, "wt") + write_scip(model, "output.mps.xz", format="mps", open=xz_text) + +Format Limitations +------------------ + +- **DIMACS**: Boolean variables and CNF constraints only. +- **OPB**: Linear constraints and integer variables. +- **MPS/LP**: Linear and integer constraints. +- **FZN**: MiniZinc-compatible constraints. + +Models containing unsupported features will raise an exception at write time. + +Checking Writer Dependencies +----------------------------- + +.. code-block:: python + + from cpmpy.tools.io.writer import writer_dependencies + + print(writer_dependencies("mps")) + # {'pyscipopt': '0.4.8'} — package name → installed version + +Listing Available Formats +-------------------------- + +.. code-block:: python + + from cpmpy.tools.io import write_formats + print(write_formats()) + # ['mps', 'lp', 'cip', 'fzn', 'gms', 'pip', 'dimacs', 'opb'] + +Converting Between Formats +--------------------------- + +Load a file in one format and write it in another: + +.. 
code-block:: python + + from cpmpy.tools.io import load, write + + model = load("input.opb") + write(model, "output.mps") + +For bulk format translation across a dataset, see :doc:`/reading_and_writing` +and the ``Translate`` transform in :doc:`datasets`. + +API Reference +------------- + +.. automodule:: cpmpy.tools.io.writer + :members: + :undoc-members: diff --git a/docs/api/tools/xcsp3.rst b/docs/api/tools/xcsp3.rst index ad3bd0868..0010c8c1d 100644 --- a/docs/api/tools/xcsp3.rst +++ b/docs/api/tools/xcsp3.rst @@ -6,10 +6,3 @@ XCSP3 (:mod:`cpmpy.tools.xcsp3`) :undoc-members: :inherited-members: -.. include:: ./xcsp3/analyze.rst -.. include:: ./xcsp3/dataset.rst -.. include:: ./xcsp3/globals.rst -.. include:: ./xcsp3/solution.rst -.. include:: ./xcsp3/cli.rst -.. include:: ./xcsp3/benchmark.rst - diff --git a/docs/dataset_authoring.md b/docs/dataset_authoring.md new file mode 100644 index 000000000..f71341f99 --- /dev/null +++ b/docs/dataset_authoring.md @@ -0,0 +1,185 @@ +--- +title: Dataset authoring +--- + +# Dataset authoring + +This guide explains how to implement new datasets for `cpmpy.tools.datasets` +in a way that is consistent with the rest of the ecosystem: + +- stable instance identifiers (`info.id`) +- structured instance metadata (`InstanceInfo`) +- dataset cards and Croissant export (`DatasetInfo`) +- sidecar metadata collection (`.meta.json`) +- PyTorch compatibility (`__len__`, `__getitem__`, `transform`, `target_transform`) + +If you only want to *use* existing datasets, start with [](datasets.md). + +## Design principles + +### (1) Stable instance IDs + +Every instance should have a stable identifier. For file-based datasets, the +default `FileDataset` behavior uses the instance file path string as the `id`. + +If your dataset is not file-based (or uses nested structures), decide and +document what uniquely identifies an instance. The guiding rule is: + +> The dataset class should define what the instance identifier means. 
+ +### (2) Metadata fields are a contract + +Metadata is a flat dict. The important part is that it is **predictable**: + +- problem-level fields: jobs, machines, horizon, … +- format-level fields: opb_*, wcnf_*, dimacs_*, … +- model-level fields: number of variables, constraints, objective info, … + +Use `FeaturesInfo` / `FieldInfo` to document the fields your dataset provides. + +## Minimal dataset: the required pieces + +`FileDataset` is the base for file-backed datasets. A minimal dataset must: + +- define class attributes: `name`, `description`, `homepage` +- implement: + - `category() -> dict` (and/or `categories()` if you want to override) + - `download()` +- optionally override `parse(instance)` for parse-first datasets + +```python +import pathlib +from cpmpy.tools.datasets.core import FileDataset + + +class MyDataset(FileDataset): + name = "mydataset" + description = "A short description of the dataset." + homepage = "https://example.com/mydataset" + + def __init__(self, root=".", transform=None, target_transform=None, download=False, **kwargs): + super().__init__( + dataset_dir=pathlib.Path(root) / self.name, + extension=".txt", + transform=transform, + target_transform=target_transform, + download=download, + **kwargs, + ) + + def parse(self, instance): + # Optional: parse file path to a domain structure + # (for parse-first workflows with parse=True) + return self.read(instance) + + def category(self) -> dict: + # Empty dict if no categories apply + return {} + + def download(self): + # Download/extract instances into self.dataset_dir + raise NotImplementedError +``` + +## Enriched dataset: optional dataset metadata and a field schema + +To make your dataset “self-documenting”, add optional dataset-level attributes +and a `features` schema: + +```python +from cpmpy.tools.datasets.metadata import FeaturesInfo, FieldInfo + + +class MyDataset(FileDataset): + name = "mydataset" + description = "A short description." 
+ homepage = "https://example.com/mydataset" + citation = ["Author et al. My Dataset. 2026."] + + version = "1.0.0" + license = "CC BY 4.0" + domain = "constraint_programming" + tags = ["combinatorial", "satisfaction"] + language = "MyFormat" + + features = FeaturesInfo({ + "num_jobs": ("int", "Number of jobs in the instance"), + "num_machines": ("int", "Number of machines"), + "optimum": FieldInfo("int", "Known optimum (if available)", nullable=True), + }) +``` + +This schema is used for dataset cards and Croissant export; it does not change +how iteration works. If you do not provide `features`, cards and Croissant +still work but omit the domain-field schema section. For **per-field defaults** +and **what you lose** when omitting or simplifying the schema (e.g. empty +descriptions, default nullability), see [Instance Metadata — Level 7: +Declaring a metadata schema](instance_metadata.md#level-7--declaring-a-metadata-schema). + +## Collecting instance metadata + +Override `collect_instance_metadata(file)` to extract domain-specific metadata +once per instance (stored in sidecars by default): + +```python +class MyDataset(FileDataset): + # ... class attrs ... + + def collect_instance_metadata(self, file) -> dict: + # `file` is the file path string by default + return {"num_jobs": 10, "num_machines": 5} +``` + +## Sidecars and advanced kwargs + +`FileDataset` supports two advanced constructor kwargs: + +- `metadata_workers` (default: 1): number of workers used when collecting all + instance metadata after download. +- `ignore_sidecar` (default: False): do not read/write sidecars; instead call + `collect_instance_metadata()` on demand when iterating. + +These kwargs are passed via `**kwargs` and unknown kwargs are ignored to keep +forward compatibility. + +## Authoring loaders and writers (the `open=` convention) + +All IO loaders accept an optional `open=` callable, so callers can control how +files are opened (e.g., decompression). 
Writers follow the same convention:
+they accept an `open=` callable for writing.
+
+Example: implementing a custom loader that supports compressed files:
+
+```python
+import os
+from typing import Union
+from io import StringIO
+
+_std_open = open
+
+def load_myformat(data: Union[str, os.PathLike], open=None):
+    if isinstance(data, (str, os.PathLike)) and os.path.exists(data):
+        f = open(data) if open is not None else _std_open(data, "rt")
+    else:
+        f = StringIO(data)
+    # parse from `f`...
+```
+
+Example: writing with a custom `open` (compression decided by the caller):
+
+```python
+import lzma
+from cpmpy.tools.io.opb import write_opb
+
+xz_text = lambda path, mode="w": lzma.open(path, "wt")
+write_opb(model, "out.opb.xz", open=xz_text)
+```
+
+## Extending existing datasets (schema inheritance)
+
+If you subclass an existing dataset and add only a few new metadata fields,
+declare only the new fields in `features`. The framework merges parent and child
+schemas automatically.
+
+See `libraries/cpmpy/examples/datasets/05_features_merge.py` for a runnable
+example.
diff --git a/docs/datasets.md b/docs/datasets.md
new file mode 100644
index 000000000..bfabec1ab
--- /dev/null
+++ b/docs/datasets.md
@@ -0,0 +1,283 @@
+---
+title: Datasets
+---
+
+# Datasets
+
+CPMpy provides a PyTorch-style dataset interface for working with collections of
+benchmark instances. Datasets handle:
+
+- downloading and local storage
+- instance discovery (files)
+- per-instance metadata collection (sidecars)
+- optional decompression on read
+- optional transforms (load, translate, save, etc.)
+
+The goal is that you can write experiments in a **data-loader style loop**:
+each item yields `(x, y)` where `x` is the instance reference and `y` is the
+metadata record.
+
+This page starts with a quickstart, then shows common pipelines, and finally
+points to the advanced authoring guides. 
+ +## Quickstart + +### 1) Iterate over instances + +Datasets yield `(file_path, info)` pairs: + +```python +from cpmpy.tools.datasets import JSPLibDataset + +ds = JSPLibDataset(root="./data", download=True) +print(len(ds), "instances") + +for file_path, info in ds: + print(info["name"], info.get("jobs"), info.get("machines")) +``` + +`info` is an `InstanceInfo` (a `dict` subclass) with structured properties. +See the dedicated metadata guide at [](instance_metadata.md). + +### 2) Load each instance into a CPMpy model + +Because datasets are PyTorch-compatible, the most direct pattern is to use the +loader as the dataset transform: + +```python +from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.io import load_xcsp3 + +ds = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) +ds.transform = load_xcsp3 + +for model, info in ds: + if model.solve(): + print(info.id, "objective:", model.objective_value() if model.has_objective() else None) +``` + +If files are compressed, keep decompression support by wrapping the IO loader: +`lambda p: load_xcsp3(p, open=ds.open)`. + +### 3) Add computed fields via `target_transform` + +Use `target_transform` when you want to enrich metadata without modifying your +loop: + +```python +from cpmpy.tools.datasets import JSPLibDataset + +ds = JSPLibDataset( + root="./data", + download=True, + target_transform=lambda info: info | { + "density": info["jobs"] / info["machines"], + "has_optimum": info.get("optimum") is not None, + }, +) + +for _, info in ds: + print(info.id, info["density"], info["has_optimum"]) +``` + +### 4) Parse-first datasets (two-step or compact) + +Some datasets represent domain data, not a fixed CPMpy model. For those +datasets, enable `parse=True` and either model in a second step or pass a model +builder as `transform`. 
+ +```python +from cpmpy.tools.datasets import PSPLibDataset, model_rcpsp + +# Two-step: parse first, model later +ds = PSPLibDataset(variant="rcpsp", family="j60", download=True, parse=True) +for (tasks, capacities), info in ds: + model, (start, end, makespan) = model_rcpsp(tasks, capacities) + model.solve() + +# Compact: parse + model in dataset pipeline +ds = PSPLibDataset( + variant="rcpsp", + family="j60", + download=True, + parse=True, + transform=model_rcpsp, +) +for (model, aux), info in ds: + model.solve() +``` + +## Common pipelines + +### Load → Translate → Save (format conversion) + +Use transform composition: + +```python +from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.datasets.transforms import Compose, Translate, SaveToFile + +ds = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) + +ds.transform = Compose([ + Translate(load_xcsp3, "opb", open=ds.open), # file_path -> OPB string + SaveToFile("./out_opb/", extension=".opb", write_metadata=True), +]) + +for output_path, info in ds: + print("saved", output_path, "id=", info.id) +``` + +When `write_metadata=True`, a `.meta.json` sidecar is written next to each +output file. It contains portable metadata (domain fields, format fields, +model features), but never in-memory objects (see `model_objects` in +[](instance_metadata.md)). + +### Load → Save → Reload from files (generic dataset) + +You can translate a named dataset to a format (e.g. OPB), write instances to +a directory, and later iterate over that directory **without** a dedicated +dataset class for the translated format. 
Use the `from_files()` helper to build +a generic file-based dataset over any directory: + +```python +from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.datasets.core import from_files +from cpmpy.tools.datasets.transforms import Compose, Translate, SaveToFile +from cpmpy.tools.io import load_opb, load_xcsp3 + +# 1) Load, translate to OPB, write to disk (with metadata sidecars) +ds = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) +ds.transform = Compose([ + Translate(load_xcsp3, "opb", open=ds.open), + SaveToFile("./out_opb/", extension=".opb", write_metadata=True), +]) +for out_path, info in ds: + pass # files written to ./out_opb/ + +# 2) Later: open the same directory as a generic dataset (no XCSP3 class needed) +generic = from_files("./out_opb/", extension=".opb") +generic.transform = load_opb # or lambda p: load_opb(p, open=open) + +for model, info in generic: + print(info["name"], info.get("path")) # minimal metadata; .meta.json can be read separately +``` + +`from_files(dataset_dir, extension)` returns a `FileDataset` that discovers +all files with the given extension under `dataset_dir` (including subdirs). It +does not provide a dataset name, description, or card/Croissant; metadata is +minimal (path, name, id). To reuse the metadata written by `SaveToFile`, read +the `.meta.json` sidecar next to each file (e.g. in a `target_transform`). + +### Using already-downloaded files (custom directory) + +If you have instance files on disk already (e.g. 
from another source or a +previous run), point the dataset at that directory instead of downloading: + +- **Same layout as the dataset expects:** use the usual class with `root` set + to the parent of the dataset folder, and `download=False`: + + ```python + # JSPLib expects root/jsplib/; your files are in /data/my_jsplib/ + ds = JSPLibDataset(root="/data", download=False) + # Then set dataset_dir to your folder, or use a symlink /data/jsplib -> /data/my_jsplib + ``` + + Concrete dataset classes typically set `dataset_dir = root / self.name` (or + `root / self.name / year / track`). So put your files under that path, or + pass a custom `dataset_dir` when the constructor supports it. + +- **Datasets that accept `dataset_dir`:** e.g. `MaxSATEvalDataset` and others + take an optional `dataset_dir`; if provided, it overrides the default + `root/name/...`: + + ```python + from cpmpy.tools.datasets import MaxSATEvalDataset + + ds = MaxSATEvalDataset( + root="./data", + year=2022, + track="exact-unweighted", + dataset_dir="/path/to/my/wcnf/files", # use this instead of downloading + download=False, + ) + for path, info in ds: + ... + ``` + +- **Arbitrary directory, any extension:** use `from_files(dataset_dir, extension)` + as in the previous section (no download, no dedicated class). + +### Generator-based datasets + +For procedurally generated instances (e.g. random graphs, parameter sweeps), +use `IterableDataset.from_generator()`. You provide a generator function that +yields `(instance_ref, metadata)` pairs and optional keyword arguments; +optionally vary some arguments to get multiple generator runs: + +```python +from cpmpy.tools.datasets.core import IterableDataset + +def my_generator(n, seed): + import random + rng = random.Random(seed) + for i in range(n): + # instance_ref: e.g. 
dict of parameters or a file path + ref = {"n": n, "seed": seed, "instance_id": i} + meta = {"name": f"gen_{n}_{seed}_{i}"} + yield ref, meta + +# Single run +ds = IterableDataset.from_generator(my_generator, gen_kwargs={"n": 5, "seed": 42}) +for ref, info in ds: + print(info["name"]) + +# Multiple runs: vary "seed" +ds = IterableDataset.from_generator( + my_generator, + gen_kwargs={"n": 5, "seed": [10, 20, 30]}, + vary="seed", +) +# Iteration runs my_generator(n=5, seed=10), then (n=5, seed=20), then (n=5, seed=30) +``` + +Generator datasets do not support `len()` or indexing; they are iterable only. +See the `IterableDataset.from_generator` docstring for `vary` with multiple keys +and `vary_mode="product"` for Cartesian products. + +### Load models and run analytics (solver-style preprocessing) + +If you want to use CPMpy's internal transformation pipeline on loaded models +(like solvers do), see [](transforms_guide.md) for end-to-end examples. + +## Sidecars and metadata collection + +By default, file-based datasets collect instance metadata once and store it +in a `.meta.json` sidecar next to each instance file. Subsequent accesses use +the sidecar and avoid re-computing metadata. 
+ +Advanced constructor kwargs (documented in detail in [](dataset_authoring.md)): + +- `metadata_workers`: parallelism for metadata collection during initial download +- `ignore_sidecar`: bypass sidecar read/write and collect metadata on demand + +## Where to look next + +- [](instance_metadata.md): `InstanceInfo`, structured partitions, `|`, and `without_format()` +- [](reading_and_writing.md): IO loaders/writers and dataset translation workflows +- [](dataset_authoring.md): implementing a new dataset class (best practices + checklists) +- [](transforms_guide.md): custom transforms, `enrich_metadata`, analytics pipelines +- [](benchmarking_workflows.md): dataset-driven experiments and transformation comparisons + +## Runnable examples + +The `libraries/cpmpy/examples/datasets/` directory contains runnable examples +that match the docs: + +- `01_basic_usage.py` +- `02_dataset_card_and_croissant.py` +- `03_target_transforms.py` +- `04_custom_dataset.py` +- `05_features_merge.py` +- `06_benchmark_survey.py` +- `07_metadata_enrichment.py` diff --git a/docs/index.rst b/docs/index.rst index a85f70462..bea0b6ac7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -122,6 +122,12 @@ Different solvers excel at different problems. `Try multiple! 
dimacs │ 0 │ 7 │ 0 │ 7 │ + │ nurserostering -> mps │ 5 │ 1 │ 0 │ 6 │ + │ nurserostering -> opb │ 3 │ 2 │ 0 │ 5 │ + │ opb DEC-LIN -> dimacs │ 1 │ 22 │ 0 │ 23 │ + │ opb DEC-LIN -> opb │ 5 │ 0 │ 0 │ 5 │ + │ opb OPT-LIN -> dimacs │ 13 │ 21 │ 0 │ 34 │ + │ opb OPT-LIN -> opb │ 14 │ 4 │ 0 │ 18 │ + │ xcsp3 COP -> dimacs │ 28 │ 91 │ 0 │ 119 │ + │ xcsp3 COP -> mps │ 57 │ 7 │ 0 │ 64 │ + │ xcsp3 COP -> opb │ 51 │ 10 │ 0 │ 61 │ + │ xcsp3 CSP -> dimacs │ 10 │ 0 │ 70 │ 80 │ + │ xcsp3 CSP -> mps │ 52 │ 1 │ 0 │ 53 │ + │ xcsp3 CSP -> opb │ 28 │ 16 │ 0 │ 44 │ + └──────────────────────────┴────┴────┴───────┴───────┘ + + + ──────────────────────────────────────── + + + + The 70 Unexpected Failures: pindakaas `_set_option` + + All 70 unexpected failures are xcsp3 CSP -> dimacs and all fail with the same error: + + AttributeError: 'CaDiCaL' object has no attribute '_set_option' + + This is caused by line 118 in libraries/cpmpy/cpmpy/solvers/pindakaas.py: + + libraries/cpmpy/cpmpy/solvers/pindakaas.py lines 115-118 + + self.encoding = "auto" + self.pdk_solver = pdk.solver.CaDiCaL() + # TODO workaround for upstream issue https://github.com/pindakaashq/pindakaas/issues/189 + self.pdk_solver._set_option("factor", 0) + + However, these are stale failures. I verified that the currently installed pindakaas (0.4.1) now supports _set_option -- it works fine. These 70 instances failed during an + earlier run when an older pindakaas build was installed. The COP -> dimacs translations were retried after the fix and no longer show this error, but these 70 CSP -> + dimacs instances were never retried. + If you re-run the CSP -> dimacs translation, these 70 should succeed (unless they hit timeout/memout). 
+ The 70 affected instances span 13 problem families: AverageAvoiding (9), FastMatrixMultiplication (7), Fillomino (6), HyperSudoku (2), MisteryShopper (10), + PoolBallTriangle (4), Soccer (2), SocialGolfers (2), SolitairePattern (5), Subisomorphism (1), Takuzu (8), WordSquare-hak (5), WordSquare-tab (9). + + ──────────────────────────────────────── + + + + Key Observations + + 1. DIMACS is the hardest target format -- it has the most memouts (especially COP: 91 memout vs 7-10 for mps/opb), because CNF encoding explodes the representation size + for integer-domain constraints. + 2. MPS has more timeouts than OPB -- COP: 57 TO for mps vs 51 for opb; CSP: 52 TO for mps vs 28 for opb. This is because the MPS pipeline goes through SCIP's C API + (overhead of constructing a SCIP model object, then serializing), while OPB writes directly to text. + 3. No segfaults, no linearization errors, no other bugs in the current failure set. All "real" failures are resource exhaustion. + + + ⬢ To-do All done + ☒ Gather all failures across all jobs and formats, categorize by root cause + ☒ Identify non-timeout/non-memout failures and analyze their causes + ☒ Present summary statistics + + +--- + + python scripts/translate_all.py --config translate_table_all.yaml --execution-mode himec --out-root ./transform_results --workers 30 --memory-limit-mb 160 +00 --time-limit 1200 --format dimacs --retry-failed \ No newline at end of file diff --git a/docs/reading_and_writing.md b/docs/reading_and_writing.md index a986b1612..c118b1f90 100644 --- a/docs/reading_and_writing.md +++ b/docs/reading_and_writing.md @@ -242,27 +242,29 @@ the dataset as a whole. More info on metadata can be found in ... 
### Loading instances into CPMpy models -Use the dataset's `load` as the `transform` argument (PyTorch-style): +Use an IO loader as the `transform` argument (PyTorch-style): ```python from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.io import load_xcsp3 dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True, - transform=dataset.load) + transform=load_xcsp3) for model, info in dataset: model.solve() ``` -Alternatively, call `load` on demand inside the loop: +Alternatively, call an IO loader on demand inside the loop: ```python from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.io import load_xcsp3 dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) for file_path, info in dataset: - model = dataset.load(file_path) + model = load_xcsp3(file_path, open=dataset.open) model.solve() ``` @@ -280,11 +282,12 @@ loading the instance into a CPMpy model, and calling a writer (or the unified ```python from cpmpy.tools.datasets import XCSP3Dataset from cpmpy.tools.io import write +from cpmpy.tools.io import load_xcsp3 dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) for file_path, info in dataset: - model = dataset.load(file_path) + model = load_xcsp3(file_path, open=dataset.open) opb_string = write(model, format="opb") # or write(model, "out.opb") print(info.id, len(opb_string), "bytes") ``` @@ -304,14 +307,14 @@ yourself, or use the `SaveToFile` helper in the pipeline; see ```python from pathlib import Path from cpmpy.tools.datasets import XCSP3Dataset -from cpmpy.tools.io import write +from cpmpy.tools.io import write, load_xcsp3 dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) out_dir = Path("./translated") out_dir.mkdir(parents=True, exist_ok=True) for file_path, info in dataset: - model = dataset.load(file_path) + model = load_xcsp3(file_path, open=dataset.open) out_path = out_dir / f"{info.id.replace('/', '_')}.opb" write(model, str(out_path)) # 
format inferred from extension print("Saved:", out_path) @@ -337,9 +340,10 @@ Example — load with custom `open` and metadata enrichment: ```python from cpmpy.tools.datasets import XCSP3Dataset from cpmpy.tools.datasets.transforms import Load +from cpmpy.tools.io import load_xcsp3 dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) -dataset.transform = Load(dataset.load, open=dataset.open) +dataset.transform = Load(load_xcsp3, open=dataset.open) for model, info in dataset: # info.model_features, info.model_objects are populated by Load model.solve() @@ -350,9 +354,10 @@ Example — translate to another format on the fly: ```python from cpmpy.tools.datasets import XCSP3Dataset from cpmpy.tools.datasets.transforms import Translate +from cpmpy.tools.io import load_xcsp3 dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) -dataset.transform = Translate(dataset.load, "opb", open=dataset.open) +dataset.transform = Translate(load_xcsp3, "opb", open=dataset.open) for opb_string, info in dataset: print(len(opb_string), "bytes") @@ -371,7 +376,7 @@ from cpmpy.tools.datasets.transforms import Compose, Translate, SaveToFile dataset = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) dataset.transform = Compose([ - Translate(dataset.load, "opb", open=dataset.open), + Translate(load_xcsp3, "opb", open=dataset.open), SaveToFile("./translated/", extension=".opb", write_metadata=True), ]) @@ -386,9 +391,10 @@ Example — load to model, then serialize to string (Compose): ```python from cpmpy.tools.datasets.transforms import Compose, Load, Serialize +from cpmpy.tools.io import load_xcsp3 dataset.transform = Compose([ - Load(dataset.load, open=dataset.open), + Load(load_xcsp3, open=dataset.open), Serialize("opb"), ]) @@ -439,20 +445,21 @@ The four metadata partitions: | `domain_metadata` | Problem-level, format-independent fields (`jobs`, `machines`, `horizon`, …) | ✅ | | `format_metadata` | Format-specific fields 
(`opb_*`, `wcnf_*`, `mps_*`, `xcsp_*`, `dimacs_*`) | ✅ | | `model_features` | CP model statistics: variable counts, constraint counts, objective info | ✅ | -| `model_objects` | Live CPMpy objects: `variables` map — **only in-memory when the transform returns a CPMpy model (e.g. `dataset.load`, `Load`, `Translate`)** | ❌ | +| `model_objects` | Live CPMpy objects: `variables` map — **only in-memory when the transform returns a CPMpy model (e.g. `load_*`, `Load`, `Translate`)** | ❌ | ### Reading solution values from metadata -Any dataset loader that returns a CPMpy model (including using the dataset's `load` -as the transform) populates `info.model_objects["variables"]` with a +Any transform that returns a CPMpy model (including `cpmpy.tools.io.load_*` +functions used as dataset transforms) populates `info.model_objects["variables"]` with a `{name: CPMpy_variable}` mapping. After solving, you can read values directly from that map without needing a separate reference to the variables: ```python from cpmpy.tools.datasets import JSPLibDataset +from cpmpy.tools.io import load_jsplib dataset = JSPLibDataset(root="./data") -dataset.transform = dataset.load +dataset.transform = load_jsplib for model, info in dataset: if model.solve(): @@ -576,7 +583,7 @@ class TranslateToOPB: dataset = JSPLibDataset(root="./data") -dataset.transform = TranslateToOPB(dataset.load, open=dataset.open) +dataset.transform = TranslateToOPB(load_jsplib, open=dataset.open) for opb_string, info in dataset: print(info["jobs"]) # domain field: carried forward @@ -702,7 +709,6 @@ The `cr:recordSet` describes the shape of each instance (e.g. one row per file); Subclass `FileDataset` and implement four things: ```python -import cpmpy as cp from cpmpy.tools.datasets import FileDataset @@ -723,11 +729,9 @@ class MyDataset(FileDataset): metadata_workers=metadata_workers, ) - @staticmethod - def _loader(content: str) -> cp.Model: - """Parse raw file content and return a CPMpy model.""" - # ... 
your parsing logic here ... - return cp.Model() + def parse(self, instance): + """Optional parse-first hook for non-model datasets.""" + return self.read(instance) def category(self) -> dict: """Return category labels (e.g. year/track). Empty dict if none.""" @@ -931,7 +935,7 @@ with the output that step produced, so each transform sees its own output: from cpmpy.tools.datasets.transforms import Compose, Load, Serialize dataset.transform = Compose([ - Load(dataset.load, open=dataset.open), # file_path → CPMpy model + Load(load_xcsp3, open=dataset.open), # file_path → CPMpy model Serialize("opb"), # CPMpy model → OPB string ]) diff --git a/docs/transforms_guide.md b/docs/transforms_guide.md new file mode 100644 index 000000000..89d3b3155 --- /dev/null +++ b/docs/transforms_guide.md @@ -0,0 +1,172 @@ +--- +title: Transforms guide +--- + +# Transforms guide + +Datasets support PyTorch-style transforms: + +- `transform`: applied to the instance reference (`x`) during iteration +- `target_transform`: applied to the metadata record (`y`) + +Transforms are the intended way to build **pipelines**: +load → preprocess → analyze → translate → save. + +This guide explains: + +- the transform protocol (`__call__`, optional `enrich_metadata`) +- composition patterns +- metadata enrichment patterns that keep records portable +- using `cpmpy.transformations.*` for solver-style preprocessing and analytics + +## The transform protocol + +Any callable can be used as `dataset.transform`. If it is an object with an +`enrich_metadata(data, metadata)` method, the dataset will call that method +after `__call__` and use its return value as the updated metadata. 
+ +Conceptually: + +```text +file_path -> transform(file_path) -> data +metadata -> target_transform(metadata) -> metadata' +``` + +and optionally: + +```text +metadata -> transform.enrich_metadata(data, metadata) -> metadata' +``` + +## Built-in transforms + +The module `cpmpy.tools.datasets.transforms` provides common building blocks: + +- `Open`: read raw file contents (decompression via an `open=` callable) +- `Load`: parse file content into a CPMpy model; enriches metadata with model features and decision variables +- `Serialize`: serialize a CPMpy model to a target format string +- `Translate`: Load + Serialize in one step +- `SaveToFile`: write content to disk (and optionally `.meta.json` sidecars) +- `Compose`: chain multiple transforms +- `Lambda`: wrap any callable as a named transform + +## Common patterns + +### Load models and solve + +```python +from cpmpy.tools.datasets import XCSP3Dataset +from cpmpy.tools.io import load_xcsp3 + +ds = XCSP3Dataset(root="./data", year=2024, track="CSP", download=True) +ds.transform = load_xcsp3 + +for model, info in ds: + model.solve() +``` + +If you want metadata enrichment (model statistics + variables), use +`Load`: + +```python +from cpmpy.tools.datasets.transforms import Load + +ds.transform = Load(load_xcsp3, open=ds.open) +for model, info in ds: + model.solve() + print(info.model_features) # populated by Load +``` + +### Translate and save + +```python +from cpmpy.tools.datasets.transforms import Compose, Translate, SaveToFile + +ds.transform = Compose([ + Translate(load_xcsp3, "opb", open=ds.open), + SaveToFile("./out_opb/", extension=".opb", write_metadata=True), +]) +``` + +## Metadata enrichment patterns + +### Add computed fields (portable) + +Use `|` to merge fields into the metadata record: + +```python +ds = JSPLibDataset( + root="./data", + target_transform=lambda info: info | { + "density": info["jobs"] / info["machines"], + }, +) +``` + +### Format-changing transforms: drop stale format fields + 
+When a transform changes the file format, old format-prefixed fields become +misleading. Use `without_format()` to strip format fields and then attach the +new ones: + +```python +from cpmpy.tools.datasets.transforms import extract_format_metadata + +new_info = info.without_format() | extract_format_metadata(opb_string, "opb") +``` + +### Implement `enrich_metadata` in a transform + +If the metadata update depends on the transform output, implement +`enrich_metadata(data, metadata)`: + +```python +from cpmpy.tools.datasets.transforms import Translate, extract_format_metadata + +class TranslateToOPB: + def __init__(self, loader, open): + self._translate = Translate(loader, "opb", open=open) + + def __call__(self, file_path): + return self._translate(file_path) # OPB string + + def enrich_metadata(self, data, metadata): + return metadata.without_format() | extract_format_metadata(data, "opb") +``` + +## Solver-style preprocessing and analytics + +CPMpy has an internal transformation toolbox under `cpmpy.transformations`. +Solvers use these transformations to rewrite high-level constraints into +supported low-level forms. + +You can use the same transformations for analytics: + +- how many constraints/variables are introduced by a decomposition? +- how long does a preprocessing pipeline take? +- does a decomposition improve solve time on a dataset subset? + +### A minimal “preprocess then measure” pattern + +```python +import time +from cpmpy.transformations.flatten_model import flatten_model +from cpmpy.transformations.get_variables import get_variables_model + +t0 = time.perf_counter() +flat = flatten_model(model) +dt = time.perf_counter() - t0 + +num_vars = len(get_variables_model(flat)) +num_cons = len(flat.constraints) +print("flatten:", dt, "vars:", num_vars, "cons:", num_cons) +``` + +### Decomposing a specific global constraint + +The `decompose_global` transformation can decompose unsupported globals. 
You can +also provide custom decompositions to compare strategies in an experiment. + +For a full runnable example that compares two strategies across a dataset +subset, see [](benchmarking_workflows.md) and the example script +`libraries/cpmpy/examples/datasets/08_transformation_benchmark.py`. From cf221b5d789a15aa5c7d4a8f01c328cd61d96ba6 Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Wed, 11 Mar 2026 11:40:33 +0100 Subject: [PATCH 150/152] Add missing wcnf parts --- cpmpy/tools/io/writer.py | 3 ++- docs/api/tools/writers.rst | 8 +++++++- docs/reading_and_writing.md | 6 +++--- setup.py | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cpmpy/tools/io/writer.py b/cpmpy/tools/io/writer.py index f6bd4ddd8..1db192be0 100644 --- a/cpmpy/tools/io/writer.py +++ b/cpmpy/tools/io/writer.py @@ -43,7 +43,7 @@ # "zpl": partial(write_scip, format="zpl"), # requires SIMPL, not included in pip package "dimacs": write_dimacs, "opb": write_opb, - # "wcnf": write_wcnf, # currently not supported + "wcnf": write_dimacs, } # Maps each format to the external packages its writer depends on. @@ -56,6 +56,7 @@ "gms": ["pyscipopt"], "pip": ["pyscipopt"], "dimacs": ["pindakaas"], + "wcnf": ["pindakaas"], "opb": [], } diff --git a/docs/api/tools/writers.rst b/docs/api/tools/writers.rst index 62d7f0071..a990144de 100644 --- a/docs/api/tools/writers.rst +++ b/docs/api/tools/writers.rst @@ -20,6 +20,7 @@ A unified ``write()`` function auto-detects the format from the file extension: model = cp.Model([x + y <= 5], minimize=x + y) write(model, "output.opb") # format from extension + write(model, "output.wcnf") write(model, "output.mps") write(model, "output.cnf") @@ -47,6 +48,10 @@ Supported Formats - ``.cnf`` - :func:`write_dimacs ` - — + * - WCNF + - ``.wcnf`` + - :func:`write_dimacs ` + - — * - MPS - ``.mps`` - :func:`write_scip ` @@ -91,6 +96,7 @@ given, making them suitable for use inside dataset transforms. 
write_dimacs(model, "output.cnf") cnf_string = write_dimacs(model) + write_dimacs(model, "output.wcnf") Compressed output ----------------- @@ -156,7 +162,7 @@ Listing Available Formats from cpmpy.tools.io import write_formats print(write_formats()) - # ['mps', 'lp', 'cip', 'fzn', 'gms', 'pip', 'dimacs', 'opb'] + # ['mps', 'lp', 'cip', 'fzn', 'gms', 'pip', 'dimacs', 'opb', 'wcnf'] Converting Between Formats --------------------------- diff --git a/docs/reading_and_writing.md b/docs/reading_and_writing.md index c118b1f90..5e6801357 100644 --- a/docs/reading_and_writing.md +++ b/docs/reading_and_writing.md @@ -24,7 +24,7 @@ to instructions on how to write your own dataset class. | Format | Extension | Load | Write | Domain | |--------|-----------|------|-------|--------| | **OPB** | `.opb` | ✅ | ✅ | Pseudo-Boolean optimization | -| **WCNF** | `.wcnf` | ✅ | — | MaxSAT | +| **WCNF** | `.wcnf` | ✅ | ✅ | MaxSAT | | **DIMACS** | `.cnf` | ✅ | ✅ | SAT | | **MPS** | `.mps` | ✅ | ✅ | Mixed integer programming | | **LP** | `.lp` | ✅ | ✅ | Linear/integer programming | @@ -329,7 +329,7 @@ that you can assign to `dataset.transform` (or use inside `Compose`): |--------|---------| | **`Load`** | Load a file path into a CPMpy model. Accepts a custom `open` callable (e.g. for compressed files) and implements `enrich_metadata` to add `model_features` and `model_objects` to the instance metadata. | | **`Open`** | Open a file path and return its raw text contents (with optional custom `open` for decompression). No parsing. | -| **`Serialize`** | Turn a CPMpy model into a string in a given format (e.g. `"opb"`, `"dimacs"`, `"mps"` or a writer function). | +| **`Serialize`** | Turn a CPMpy model into a string in a given format (e.g. `"opb"`, `"wcnf"`, `"dimacs"`, `"mps"` or a writer function). | | **`Translate`** | Load from one format and serialize to another in one step (e.g. XCSP3 → OPB). Uses a custom `open` for reading and enriches metadata from the intermediate model. 
| | **`SaveToFile`** | Write the transform output (e.g. a string) to a file under a given directory; optional `.meta.json` sidecar. | | **`Compose`** | Chain several transforms; each step's output is passed to the next, and each step's `enrich_metadata` (if present) is called with its own output. | @@ -363,7 +363,7 @@ for opb_string, info in dataset: print(len(opb_string), "bytes") ``` -`Translate` accepts a format name string (`"opb"`, `"dimacs"`, `"mps"`, …) or a +`Translate` accepts a format name string (`"opb"`, `"wcnf"`, `"dimacs"`, `"mps"`, …) or a writer function directly. Under the hood it loads the instance into a CPMpy model and serializes it to the target format. diff --git a/setup.py b/setup.py index 0d99f7772..96787a4af 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ def get_version(rel_path): "io.scip": ["pyscipopt"], "io.dimacs": solver_dependencies["pindakaas"], # Required for write_dimacs (uses to_cnf transformation) "io.opb": [], # No external dependencies - "io.wcnf": [], # No external dependencies + "io.wcnf": solver_dependencies["pindakaas"], # Required for write(..., format="wcnf") via DIMACS writer path "io.xcsp3": ["pycsp3"], } format_dependencies["io.all"] = list({pkg for group in format_dependencies.values() for pkg in group}) From 43746f8914f41897fe8a69cf865d19f823005c4b Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 3 Apr 2026 16:43:55 +0200 Subject: [PATCH 151/152] Remove import of not yet added dataset --- cpmpy/tools/datasets/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpmpy/tools/datasets/__init__.py b/cpmpy/tools/datasets/__init__.py index 943c4f663..5819b2fc6 100644 --- a/cpmpy/tools/datasets/__init__.py +++ b/cpmpy/tools/datasets/__init__.py @@ -37,7 +37,6 @@ "OPBDataset", "MaxSATEvalDataset", "SATDataset", - "DIMACSCliqueDataset", # Parse/model helpers for parse-first datasets "parse_jsp", "model_jobshop", @@ -64,7 +63,6 @@ from .opb import OPBDataset from .mse import MaxSATEvalDataset from .sat 
import SATDataset -from .dimacs_clique import DIMACSCliqueDataset from .transforms import Compose, Open, Load, Serialize, Translate, SaveToFile, Lambda, extract_format_metadata # Backward compatibility alias Parse = Load From 9369103005682b147f0a7ef397a4e7f388c37adb Mon Sep 17 00:00:00 2001 From: ThomSerg Date: Fri, 3 Apr 2026 20:46:28 +0200 Subject: [PATCH 152/152] Disable model features temporarily --- cpmpy/tools/datasets/core.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpmpy/tools/datasets/core.py b/cpmpy/tools/datasets/core.py index 5af317e09..aabdd659c 100644 --- a/cpmpy/tools/datasets/core.py +++ b/cpmpy/tools/datasets/core.py @@ -1013,16 +1013,16 @@ def _collect_one_metadata(self, file_path): except (json.JSONDecodeError, IOError): pass - if model_features is None: - if not callable(self.reader): - raise TypeError( - f"Cannot extract model features for {file_path}: " - "no dataset reader configured. If unexpected, please open an issue on GitHub." - ) - model = self.reader(str(file_path), open=self.open) - model_features = extract_model_features(model) + # if model_features is None: + # if not callable(self.reader): + # raise TypeError( + # f"Cannot extract model features for {file_path}: " + # "no dataset reader configured. If unexpected, please open an issue on GitHub." + # ) + # model = self.reader(str(file_path), open=self.open) + # model_features = extract_model_features(model) - sidecar["model_features"] = model_features + # sidecar["model_features"] = model_features with open(meta_path, "w") as f: json.dump(sidecar, f, indent=2)