From e5ba9844a1651dcc0f989e8c2853097ea966e983 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Mon, 23 Feb 2026 23:02:39 +0530 Subject: [PATCH 1/4] [DOC] Add usage examples to core function docstrings (#1538) --- openml/datasets/functions.py | 12 ++++++ openml/runs/functions.py | 78 ++++++++++++++++++++++++++---------- openml/study/functions.py | 25 +++++++++++- openml/tasks/functions.py | 12 +++++- 4 files changed, 103 insertions(+), 24 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 3ac657ea0..c2c59683b 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -364,6 +364,11 @@ def get_datasets( ------- datasets : list of datasets A list of dataset objects. + + Examples + -------- + >>> import openml + >>> datasets = openml.datasets.get_datasets([1, 2, 3]) # doctest: +SKIP """ datasets = [] for dataset_id in dataset_ids: @@ -446,6 +451,13 @@ def get_dataset( # noqa: C901, PLR0912 ------- dataset : :class:`openml.OpenMLDataset` The downloaded dataset. + + Examples + -------- + >>> import openml + >>> dataset = openml.datasets.get_dataset(1) # doctest: +SKIP + >>> dataset = openml.datasets.get_dataset("iris", version=1) # doctest: +SKIP + >>> dataset = openml.datasets.get_dataset(1, download_data=True) # doctest: +SKIP """ if download_all_files: warnings.warn( diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 503788dbd..ffb468c69 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -104,6 +104,15 @@ def run_model_on_task( # noqa: PLR0913 Result of the run. flow : OpenMLFlow (optional, only if `return_flow` is True). Flow generated from the model. + + Examples + -------- + >>> import openml + >>> import openml_sklearn # doctest: +SKIP + >>> from sklearn.tree import DecisionTreeClassifier # doctest: +SKIP + >>> clf = DecisionTreeClassifier() # doctest: +SKIP + >>> task = openml.tasks.get_task(1) # doctest: +SKIP + >>> run = openml.runs.run_model_on_task(clf, task) # doctest: +SKIP """ if avoid_duplicate_runs is None: avoid_duplicate_runs = openml.config.avoid_duplicate_runs @@ -273,9 +282,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 setup_id = setup_exists(flow_from_server) ids = run_exists(task.task_id, setup_id) if ids: - error_message = ( - "One or more runs of this setup were already performed on the task." - ) + error_message = "One or more runs of this setup were already performed on the task." raise OpenMLRunsExistError(ids, error_message) else: # Flow does not exist on server and we do not want to upload it. @@ -505,11 +512,15 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, C901 # this information is multiple times overwritten, but due to the ordering # of tne loops, eventually it contains the information based on the full # dataset size - user_defined_measures_per_fold = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' + user_defined_measures_per_fold = ( + OrderedDict() + ) # type: 'OrderedDict[str, OrderedDict]' # stores sample-based evaluation measures (sublevel of fold-based) # will also be filled on a non sample-based task, but the information # is the same as the fold-based measures, and disregarded in that case - user_defined_measures_per_sample = OrderedDict() # type: 'OrderedDict[str, OrderedDict]' + user_defined_measures_per_sample = ( + OrderedDict() + ) # type: 'OrderedDict[str, OrderedDict]' # TODO use different iterator to only provide a single iterator (less # methods, less maintenance, less confusion) @@ -557,9 +568,14 @@ def _run_task_get_arffcontent( # noqa: PLR0915, PLR0912, C901 ) # job_rvals contain the output of all the runs with one-to-one correspondence with `jobs` for n_fit, rep_no, fold_no, sample_no in jobs: - pred_y, proba_y, test_indices, test_y, inner_trace, user_defined_measures_fold = job_rvals[ - n_fit - 1 - ] + ( + pred_y, + proba_y, + test_indices, + test_y, + inner_trace, + user_defined_measures_fold, + ) = job_rvals[n_fit - 1] if inner_trace is not None: traces.append(inner_trace) @@ -598,7 +614,11 @@ def _calculate_local_measure( # type: ignore if isinstance(test_y[i], (int, np.integer)) else test_y[i] ) - pred_prob = proba_y.iloc[i] if isinstance(proba_y, pd.DataFrame) else proba_y[i] + pred_prob = ( + proba_y.iloc[i] + if isinstance(proba_y, pd.DataFrame) + else proba_y[i] + ) arff_line = format_prediction( task=task, @@ -661,11 +681,13 @@ def _calculate_local_measure( # type: ignore if rep_no not in user_defined_measures_per_sample[measure]: user_defined_measures_per_sample[measure][rep_no] = OrderedDict() if fold_no not in user_defined_measures_per_sample[measure][rep_no]: - user_defined_measures_per_sample[measure][rep_no][fold_no] = OrderedDict() + user_defined_measures_per_sample[measure][rep_no][ + fold_no + ] = OrderedDict() - user_defined_measures_per_fold[measure][rep_no][fold_no] = user_defined_measures_fold[ - measure - ] + user_defined_measures_per_fold[measure][rep_no][fold_no] = ( + user_defined_measures_fold[measure] + ) user_defined_measures_per_sample[measure][rep_no][fold_no][sample_no] = ( user_defined_measures_fold[measure] ) @@ -821,7 +843,9 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: # noqa: FBT0 run : OpenMLRun Run corresponding to ID, fetched from the server. """ - run_dir = Path(openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id)) + run_dir = Path( + openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) + ) run_file = run_dir / "description.xml" run_dir.mkdir(parents=True, exist_ok=True) @@ -840,7 +864,9 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: # noqa: FBT0 return _create_run_from_xml(run_xml) -def _create_run_from_xml(xml: str, from_server: bool = True) -> OpenMLRun: # noqa: PLR0915, PLR0912, C901, FBT002 +def _create_run_from_xml( + xml: str, from_server: bool = True +) -> OpenMLRun: # noqa: PLR0915, PLR0912, C901, FBT002 """Create a run object from xml returned from server. Parameters @@ -870,11 +896,13 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore if not from_server: return None - raise AttributeError("Run XML does not contain required (server) field: ", fieldname) + raise AttributeError( + "Run XML does not contain required (server) field: ", fieldname + ) - run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[ - "oml:run" - ] + run = xmltodict.parse( + xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"] + )["oml:run"] run_id = obtain_field(run, "oml:run_id", from_server, cast=int) uploader = obtain_field(run, "oml:uploader", from_server, cast=int) uploader_name = obtain_field(run, "oml:uploader_name", from_server) @@ -1029,7 +1057,9 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore def _get_cached_run(run_id: int) -> OpenMLRun: """Load a run from the cache.""" - run_cache_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) + run_cache_dir = openml.utils._create_cache_directory_for_id( + RUNS_CACHE_DIR_NAME, run_id + ) run_file = run_cache_dir / "description.xml" try: with run_file.open(encoding="utf8") as fh: @@ -1199,7 +1229,9 @@ def __list_runs(api_call: str) -> pd.DataFrame: runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",)) # Minimalistic check if the XML is useful if "oml:runs" not in runs_dict: - raise ValueError(f'Error in return XML, does not contain "oml:runs": {runs_dict}') + raise ValueError( + f'Error in return XML, does not contain "oml:runs": {runs_dict}' + ) if "@xmlns:oml" not in runs_dict["oml:runs"]: raise ValueError( @@ -1213,7 +1245,9 @@ def __list_runs(api_call: str) -> pd.DataFrame: f'"http://openml.org/openml": {runs_dict}', ) - assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"]) + assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type( + runs_dict["oml:runs"] + ) runs = { int(r["oml:run_id"]): { diff --git a/openml/study/functions.py b/openml/study/functions.py index bb24ddcff..24f1d8f7f 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -30,6 +30,12 @@ def get_suite(suite_id: int | str) -> OpenMLBenchmarkSuite: ------- OpenMLSuite The OpenML suite object + + Examples + -------- + >>> import openml + >>> suite = openml.study.get_suite(99) # doctest: +SKIP + >>> suite = openml.study.get_suite("OpenML-CC18") # doctest: +SKIP """ study = _get_study(suite_id, entity_type="task") assert isinstance(study, OpenMLBenchmarkSuite) @@ -59,6 +65,11 @@ def get_study( ------- OpenMLStudy The OpenML study object + + Examples + -------- + >>> import openml + >>> study = openml.study.get_study(1) # doctest: +SKIP """ if study_id == "OpenML100": message = ( @@ -109,7 +120,10 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy: tags = [] if "oml:tag" in result_dict: for tag in result_dict["oml:tag"]: - current_tag = {"name": tag["oml:name"], "write_access": tag["oml:write_access"]} + current_tag = { + "name": tag["oml:name"], + "write_access": tag["oml:write_access"], + } if "oml:window_start" in tag: current_tag["window_start"] = tag["oml:window_start"] tags.append(current_tag) @@ -210,6 +224,15 @@ def create_study( ------- OpenMLStudy A local OpenML study object (call publish method to upload to server) + + Examples + -------- + >>> import openml + >>> study = openml.study.create_study( # doctest: +SKIP + ... name="My Study", + ... description="A study on classification tasks", + ... run_ids=[1, 2, 3], + ... ) """ return OpenMLStudy( study_id=None, diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 3fbc7adee..cb1e9295b 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -380,7 +380,11 @@ def get_tasks( tasks = [] for task_id in task_ids: tasks.append( - get_task(task_id, download_data=download_data, download_qualities=download_qualities) + get_task( + task_id, + download_data=download_data, + download_qualities=download_qualities, + ) ) return tasks @@ -411,6 +415,12 @@ def get_task( Returns ------- task: OpenMLTask + + Examples + -------- + >>> import openml + >>> task = openml.tasks.get_task(1) # doctest: +SKIP + >>> task = openml.tasks.get_task(1, download_splits=True) # doctest: +SKIP """ if not isinstance(task_id, int): raise TypeError(f"Task id should be integer, is {type(task_id)}") From 2e455d191b05db84ec37a0336fee850a74b328d0 Mon Sep 17 00:00:00 2001 From: Abhishek Date: Mon, 23 Feb 2026 23:14:14 +0530 Subject: [PATCH 2/4] [DOC] Add usage examples to core function docstrings (#1538) --- .DS_Store | Bin 0 -> 8196 bytes openml/runs/functions.py | 61 +++++++----------- .../misc/features_with_whitespaces.xml.pkl | Bin 0 -> 253 bytes 3 files changed, 22 insertions(+), 39 deletions(-) create mode 100644 .DS_Store create mode 100644 tests/files/misc/features_with_whitespaces.xml.pkl diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..391ba45554c973056274a3e8b4a31021f12627d5 GIT binary patch literal 8196 zcmeI1O=uHA6vyAxGzrAof>3%~{J4l8E!ta!G^QY;f)qSz+a#vNq#K$P>_NJM3Pl7D zUZoci)E+ztp7bhs61;fvq*tM!C%yQ8JJZdZWH;$aq|TI?x6FGp?>BE|XEIwNQiav> z1ko4~WszBq~Hb>E@{Sj9uQyKug`6l(~E;<5`&-iVD4>e=jsAmDkdhE&?tB-4Wosdx}m{m6oVw z-QVx8oszn`Uo0-v`~}oyzikI^%Fo_U+xl`(ePdC48`xmCl-co1i?m7$mUo&w&}|_Z zHXc0Rn$`>*{nY(7NwSs2Ljm*Tz*DC+Y9L>y63rtoQbls9<1Zg4TS~s$cds73p!pK| zm7M*pC9j&EW8NX~GHZ(})I^!UOmy!`Z)?e&+qyO|qq%$ZFWGn3Lh=)RWPXmz3TFNW z{u0^KYsr?9duTl~UTMcWreDciJpJXcn4K7Fn>WwAX)2SCHE|R1UXiPU=t#O#FWE-l zMQhdSLlPy~#BihxN7H7QKM#gTC;cD#9Qhae_FEFCxBWii@n>Qt zK280+q}P>YV{r_>Vvb?Sfwf)1u3DuN*jbmc!hMU3CXe-RAz8mZd3dg%Ir{jK^t*3m zjYsc?8SE-0yrD~oZmVOqDYFj#4hzZq`bRX^T~_UxPPVdnJUFAlXs5GR-sXqPwpMbC z9z1i$#7(r9F^ada{68IG|9dU%(*qjg`~OVx_y4^%Y&RYk0T%%g0hKDw77M5_cWa^2 zk9^nmkmry&v0b2L*aQWeaD8|khm8JVh-c5}@ OpenMLRun: # noqa: FBT0 run : OpenMLRun Run corresponding to ID, fetched from the server. """ - run_dir = Path( - openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) - ) + run_dir = Path(openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id)) run_file = run_dir / "description.xml" run_dir.mkdir(parents=True, exist_ok=True) @@ -864,9 +854,10 @@ def get_run(run_id: int, ignore_cache: bool = False) -> OpenMLRun: # noqa: FBT0 return _create_run_from_xml(run_xml) -def _create_run_from_xml( - xml: str, from_server: bool = True -) -> OpenMLRun: # noqa: PLR0915, PLR0912, C901, FBT002 +def _create_run_from_xml( # noqa: PLR0915, PLR0912, C901 + xml: str, + from_server: bool = True, # noqa: FBT002 +) -> OpenMLRun: """Create a run object from xml returned from server. Parameters @@ -896,13 +887,11 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore if not from_server: return None - raise AttributeError( - "Run XML does not contain required (server) field: ", fieldname - ) + raise AttributeError("Run XML does not contain required (server) field: ", fieldname) - run = xmltodict.parse( - xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"] - )["oml:run"] + run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[ + "oml:run" + ] run_id = obtain_field(run, "oml:run_id", from_server, cast=int) uploader = obtain_field(run, "oml:uploader", from_server, cast=int) uploader_name = obtain_field(run, "oml:uploader_name", from_server) @@ -1057,9 +1046,7 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): # type: ignore def _get_cached_run(run_id: int) -> OpenMLRun: """Load a run from the cache.""" - run_cache_dir = openml.utils._create_cache_directory_for_id( - RUNS_CACHE_DIR_NAME, run_id - ) + run_cache_dir = openml.utils._create_cache_directory_for_id(RUNS_CACHE_DIR_NAME, run_id) run_file = run_cache_dir / "description.xml" try: with run_file.open(encoding="utf8") as fh: @@ -1229,9 +1216,7 @@ def __list_runs(api_call: str) -> pd.DataFrame: runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",)) # Minimalistic check if the XML is useful if "oml:runs" not in runs_dict: - raise ValueError( - f'Error in return XML, does not contain "oml:runs": {runs_dict}' - ) + raise ValueError(f'Error in return XML, does not contain "oml:runs": {runs_dict}') if "@xmlns:oml" not in runs_dict["oml:runs"]: raise ValueError( @@ -1245,9 +1230,7 @@ def __list_runs(api_call: str) -> pd.DataFrame: f'"http://openml.org/openml": {runs_dict}', ) - assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type( - runs_dict["oml:runs"] - ) + assert isinstance(runs_dict["oml:runs"]["oml:run"], list), type(runs_dict["oml:runs"]) runs = { int(r["oml:run_id"]): { diff --git a/tests/files/misc/features_with_whitespaces.xml.pkl b/tests/files/misc/features_with_whitespaces.xml.pkl new file mode 100644 index 0000000000000000000000000000000000000000..f6a775cc7e9ab9d0db09aa559f3c6b4495859abf GIT binary patch literal 253 zcmZo*nfi$V0&1sdcr*0Ku(6B$gx=rlX;DVmK_fSP+)GxJhXE2aPqVaZF(1uA9=Gn~@H2{I|Zq_O}gz@Ar{n_85aJf(*( zFF!XkFEJ;+EHS4vwRnnOk0?+sDYYm*H?z1nGcO$`0W^&(Kd&S|CqF$Cq^>l{n=ykK zVq*pykj>u19A;uPC4&=aI>gK=8Qifz=Wr_MDwrAof!>rJ4hTzIZ%S$gk2gaGFVLJ) FJpk*eSy=!8 literal 0 HcmV?d00001 From c1557096a70df68ba0224e9a403a555c754b539a Mon Sep 17 00:00:00 2001 From: Abhishek Date: Sun, 1 Mar 2026 22:23:06 +0530 Subject: [PATCH 3/4] Address review: fix run_model_on_task example, remove unwanted files - Use HistGradientBoostingClassifier instead of DecisionTreeClassifier (fixes ValueError with missing values in task 1) - Remove .DS_Store - Remove tests/files/misc/features_with_whitespaces.xml.pkl --- .DS_Store | Bin 8196 -> 0 bytes openml/runs/functions.py | 4 ++-- .../files/misc/features_with_whitespaces.xml.pkl | Bin 253 -> 0 bytes 3 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 .DS_Store delete mode 100644 tests/files/misc/features_with_whitespaces.xml.pkl diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 391ba45554c973056274a3e8b4a31021f12627d5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeI1O=uHA6vyAxGzrAof>3%~{J4l8E!ta!G^QY;f)qSz+a#vNq#K$P>_NJM3Pl7D zUZoci)E+ztp7bhs61;fvq*tM!C%yQ8JJZdZWH;$aq|TI?x6FGp?>BE|XEIwNQiav> z1ko4~WszBq~Hb>E@{Sj9uQyKug`6l(~E;<5`&-iVD4>e=jsAmDkdhE&?tB-4Wosdx}m{m6oVw z-QVx8oszn`Uo0-v`~}oyzikI^%Fo_U+xl`(ePdC48`xmCl-co1i?m7$mUo&w&}|_Z zHXc0Rn$`>*{nY(7NwSs2Ljm*Tz*DC+Y9L>y63rtoQbls9<1Zg4TS~s$cds73p!pK| zm7M*pC9j&EW8NX~GHZ(})I^!UOmy!`Z)?e&+qyO|qq%$ZFWGn3Lh=)RWPXmz3TFNW z{u0^KYsr?9duTl~UTMcWreDciJpJXcn4K7Fn>WwAX)2SCHE|R1UXiPU=t#O#FWE-l zMQhdSLlPy~#BihxN7H7QKM#gTC;cD#9Qhae_FEFCxBWii@n>Qt zK280+q}P>YV{r_>Vvb?Sfwf)1u3DuN*jbmc!hMU3CXe-RAz8mZd3dg%Ir{jK^t*3m zjYsc?8SE-0yrD~oZmVOqDYFj#4hzZq`bRX^T~_UxPPVdnJUFAlXs5GR-sXqPwpMbC z9z1i$#7(r9F^ada{68IG|9dU%(*qjg`~OVx_y4^%Y&RYk0T%%g0hKDw77M5_cWa^2 zk9^nmkmry&v0b2L*aQWeaD8|khm8JVh-c5}@>> import openml >>> import openml_sklearn # doctest: +SKIP - >>> from sklearn.tree import DecisionTreeClassifier # doctest: +SKIP - >>> clf = DecisionTreeClassifier() # doctest: +SKIP + >>> from sklearn.ensemble import HistGradientBoostingClassifier # doctest: +SKIP + >>> clf = HistGradientBoostingClassifier() # doctest: +SKIP >>> task = openml.tasks.get_task(1) # doctest: +SKIP >>> run = openml.runs.run_model_on_task(clf, task) # doctest: +SKIP """ diff --git a/tests/files/misc/features_with_whitespaces.xml.pkl b/tests/files/misc/features_with_whitespaces.xml.pkl deleted file mode 100644 index f6a775cc7e9ab9d0db09aa559f3c6b4495859abf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 253 zcmZo*nfi$V0&1sdcr*0Ku(6B$gx=rlX;DVmK_fSP+)GxJhXE2aPqVaZF(1uA9=Gn~@H2{I|Zq_O}gz@Ar{n_85aJf(*( zFF!XkFEJ;+EHS4vwRnnOk0?+sDYYm*H?z1nGcO$`0W^&(Kd&S|CqF$Cq^>l{n=ykK zVq*pykj>u19A;uPC4&=aI>gK=8Qifz=Wr_MDwrAof!>rJ4hTzIZ%S$gk2gaGFVLJ) FJpk*eSy=!8 From 49bd8db00c023226dd3e58c91f36727f151b176c Mon Sep 17 00:00:00 2001 From: Abhishek Date: Mon, 2 Mar 2026 02:03:48 +0530 Subject: [PATCH 4/4] Use get_task(6) with DecisionTreeClassifier per review feedback --- openml/runs/functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index d9ab00d33..dc81f9987 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -109,9 +109,9 @@ def run_model_on_task( # noqa: PLR0913 -------- >>> import openml >>> import openml_sklearn # doctest: +SKIP - >>> from sklearn.ensemble import HistGradientBoostingClassifier # doctest: +SKIP - >>> clf = HistGradientBoostingClassifier() # doctest: +SKIP - >>> task = openml.tasks.get_task(1) # doctest: +SKIP + >>> from sklearn.tree import DecisionTreeClassifier # doctest: +SKIP + >>> clf = DecisionTreeClassifier() # doctest: +SKIP + >>> task = openml.tasks.get_task(6) # doctest: +SKIP >>> run = openml.runs.run_model_on_task(clf, task) # doctest: +SKIP """ if avoid_duplicate_runs is None: