diff --git a/tutorials/advanced/checkpoint.ipynb b/tutorials/advanced/checkpoint.ipynb
index a0495eb7..8779bc0f 100644
--- a/tutorials/advanced/checkpoint.ipynb
+++ b/tutorials/advanced/checkpoint.ipynb
@@ -52,12 +52,12 @@
"\n",
"Lazy | Filesystem | Permanent | Deterministic | Explicit | Spark Equivalent | Fugue Equivalent\n",
":---:|:---:|:---:|:---:|:---:|:---|:---\n",
- "Yes | No | No | No | No | `persist()` | `weak_checkpoint(lazy=True)`\n",
- "No | No | No | No | No | | `persist()` or `weak_checkpoint(lazy=False)`\n",
- "Yes | Yes | No | No | No | | `strong_checkpoint(lazy=True)`\n",
+ "Yes | No | No | No | No | `persist()` | `persist(lazy=True)`\n",
+ "No | No | No | No | No | | `persist()` or `persist(lazy=False)`\n",
+ "Yes | Yes | No | No | No | | `transform(..., save_path=None, checkpoint=True)`\n",
"No | Yes | No | No | No | | `checkpoint()` or `strong_checkpoint(lazy=False)`\n",
"Yes | Yes | Yes | No | No | | \n",
- "No | Yes | Yes | No | No | `checkpoint` | `yield_as`\n",
+ "No | Yes | Yes | No | No | `checkpoint()` | `yield_as`\n",
"Yes | Yes | Yes | Yes | No | | `deteriministic_checkpoint(lazy=True)`\n",
"No | Yes | Yes | Yes | No | | `deteriministic_checkpoint()`\n",
"Yes | Yes | Yes | Yes | Yes | | \n",
@@ -70,26 +70,26 @@
"\n",
"### Weak Checkpoint\n",
"\n",
- "Weak checkpoint is in memory checkpoint. We don't use 'in memory' because it may also use executor's local disk space, depending on the execution engine we use. `persist()` is an alias of `weak_checkpoint(lazy=False)`. \n",
+ "Weak checkpoint is an in memory checkpoint. We don't use 'in memory' because it may also use the executor's local disk space, depending on the execution engine we use. To invoke this, use `persist(lazy=False)`. \n",
"\n",
- "Note Fugue persist is eager but Spark persist is lazy."
+ "Note that Fugue persist is eager by default, but Spark persist is lazy."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
- "from fugue import FugueWorkflow\n",
- "from fugue_spark import SparkExecutionEngine\n",
"from time import sleep\n",
"import pandas as pd\n",
+ "import fugue.api as fa\n",
"\n",
- "# schema: *\n",
"def just_wait(df:pd.DataFrame) -> pd.DataFrame:\n",
" sleep(5)\n",
- " return df"
+ " return df\n",
+ "\n",
+ "df = pd.DataFrame({\"a\": [1,2,3], \"b\": [1,2,3]})"
]
},
{
@@ -101,37 +101,141 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
- "%%time\n",
- "with FugueWorkflow(SparkExecutionEngine()) as dag:\n",
- " df = dag.df([[0]],\"a:int\")\n",
- " df = df.transform(just_wait)\n",
- " df.show()\n",
- " df.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In the following code, the two shows, due to Spark lineage, will duplicae the transform step, and will take 10 sec"
+ "from pyspark.sql import SparkSession\n",
+ "\n",
+ "spark = SparkSession.builder.getOrCreate()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a:long | \n",
+ " b:long | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "SparkDataFrame: a:long,b:long"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " a:long | \n",
+ " b:long | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "SparkDataFrame: a:long,b:long"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 49.5 ms, sys: 13.7 ms, total: 63.2 ms\n",
+ "Wall time: 10.6 s\n"
+ ]
+ }
+ ],
"source": [
"%%time\n",
- "with FugueWorkflow(SparkExecutionEngine()) as dag:\n",
- " df = dag.df([[0]],\"a:int\")\n",
- " df = df.transform(just_wait).persist()\n",
- " df.show()\n",
- " df.show()"
+ "with fa.engine_context(spark):\n",
+ " res = fa.transform(df, just_wait, schema=\"*\")\n",
+ " fa.show(res)\n",
+ " fa.show(res)"
]
},
{
@@ -143,35 +247,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 26.4 ms, sys: 6.85 ms, total: 33.2 ms\n",
+ "Wall time: 83.9 ms\n"
+ ]
+ }
+ ],
"source": [
"%%time\n",
- "with FugueWorkflow(SparkExecutionEngine()) as dag:\n",
- " df = dag.df([[0]],\"a:int\")\n",
- " df = df.transform(just_wait).weak_checkpoint(lazy=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The following takes 5 sec because the first show triggers the persist and the second uses the cache. But it is pointless to use a lazy checkpoint in a predefined dag like this."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%time\n",
- "with FugueWorkflow(SparkExecutionEngine()) as dag:\n",
- " df = dag.df([[0]],\"a:int\")\n",
- " df = df.transform(just_wait).weak_checkpoint(lazy=True)\n",
- " df.show()\n",
- " df.show()"
+ "with fa.engine_context(spark):\n",
+ " res = fa.transform(df, just_wait, schema=\"*\")\n",
+ " fa.persist(res, lazy=True)"
]
},
{
@@ -180,7 +272,7 @@
"source": [
"### Strong Checkpoint\n",
"\n",
- "Strong checkpoints are in file but non-permanent checkpoints. They are removed after the DAG execution done.\n",
+ "Strong checkpoints are in file but non-permanent checkpoints. They are removed after the execution is completed.\n",
"\n",
"Note Fugue checkpoint is non-permanent but Spark checkpoint is permanent.\n",
"\n",
@@ -189,32 +281,40 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%time\n",
- "engine = SparkExecutionEngine(conf={\"fugue.workflow.checkpoint.path\":\"/tmp\"})\n",
- "with FugueWorkflow(engine) as dag:\n",
- " df = dag.df([[0]],\"a:int\")\n",
- " df = df.transform(just_wait).checkpoint()\n",
- " df.show()\n",
- " df.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+---+---+\n",
+ "| a| b|\n",
+ "+---+---+\n",
+ "| 3| 3|\n",
+ "| 1| 1|\n",
+ "| 2| 2|\n",
+ "+---+---+\n",
+ "\n",
+ "+---+---+\n",
+ "| a| b|\n",
+ "+---+---+\n",
+ "| 3| 3|\n",
+ "| 1| 1|\n",
+ "| 2| 2|\n",
+ "+---+---+\n",
+ "\n",
+ "CPU times: user 36.3 ms, sys: 9.8 ms, total: 46.1 ms\n",
+ "Wall time: 5.57 s\n"
+ ]
+ }
+ ],
"source": [
"%%time\n",
- "engine = SparkExecutionEngine(conf={\"fugue.workflow.checkpoint.path\":\"/tmp\"})\n",
- "with FugueWorkflow(engine) as dag:\n",
- " df = dag.df([[0]],\"a:int\")\n",
- " df = df.transform(just_wait).strong_checkpoint(lazy=True)\n",
- " df.show()\n",
- " df.show()"
+ "with fa.engine_context(spark, engine_conf={\"fugue.workflow.checkpoint.path\":\"/tmp\"}):\n",
+ " res = fa.transform(df, just_wait, schema=\"*\", checkpoint=True)\n",
+ " res.show()\n",
+ " res.show()"
]
},
{
@@ -223,7 +323,7 @@
"source": [
"### Deterministic Checkpoint\n",
"\n",
- "Deterministic checkpoint is mainly for 'resuming'. Expecially when in an interactive environment, you may not want to rerun the whole DAG every time you make a minor change. But note that, practically, you should avoid using deterministic checkpoint in production, also whenever you use deterministic checkpoint, ask yourself, is the dag overly complicated? Can you make it more modulized?\n",
+ "Deterministic checkpoint is mainly for 'resuming'. Especially when in an interactive environment, you may not want to rerun the whole DAG every time you make a minor change. But note that, practically, you should avoid using deterministic checkpoint in production, also whenever you use deterministic checkpoint, ask yourself, is the dag overly complicated? Can you make it more modularized?\n",
"\n",
"See this example"
]
@@ -319,6 +419,22 @@
"It is common to see you have a good separation of your logic, and they are divided into several DAGs, and you need to pass the dataframes between the DAGs. This can be done by `save` and `load` an explicit file path, but for intermediate data, you don't always want to specify paths for them, also you want determinism to work cross DAGs. This is reason to use `yield_as`"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "query = \"\"\"\n",
+ "TRANSFORM df USING just_wait SCHEMA *\n",
+ "DETERMINISTIC CHECKPOINT\n",
+ "YIELD DATAFRAME AS result\n",
+ "\"\"\"\n",
+ "\n",
+ "res = fa.fugue_sql_flow(query).run(engine=spark, conf={\"fugue.workflow.checkpoint.path\":\"/tmp\"})\n",
+ "res['result']"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -383,7 +499,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.9"
+ "version": "3.8.13"
},
"vscode": {
"interpreter": {
diff --git a/tutorials/advanced/index.md b/tutorials/advanced/index.md
index 469babdd..dc1f128d 100644
--- a/tutorials/advanced/index.md
+++ b/tutorials/advanced/index.md
@@ -12,6 +12,7 @@ Since you already have experience in Spark or distributed computing in general,
:hidden:
useful_config
+checkpoint
execution_engine
validation
schema_dataframes
@@ -30,8 +31,11 @@ These configurations can have significant impact on building and running the Fug
## [Execution Engine](execution_engine.ipynb)
The heart of Fugue. It is the layer that unifies many of the core concepts of distributed computing, and separates the underlying computing frameworks from user level logic. Normally you don't directly interact with execution engines. But it's good to understand some basics.
+## [Checkpoint](checkpoint.ipynb)
+These are the operations used to avoid duplicate computation when working with larger datasets. Fugue has more granular checkpoint operations than Spark.
+
## [Validation](validation.ipynb)
-Fugue applies input validation.
+Fugue applies input validation to make sure the data has the expected format before a transformation is applied.
## [Data Type, Schema & DataFrames](schema_dataframes.ipynb)
Fugue data types and schema are strictly based on [Apache Arrow](https://arrow.apache.org/docs/index.html). Dataframe is an abstract concept with several built-in implementations to adapt to different dataframes. In this tutorial, we will go through the basic APIs and focus on the most common use cases.
diff --git a/tutorials/applications/recipes/loading_databases.ipynb b/tutorials/applications/recipes/loading_databases.ipynb
deleted file mode 100644
index 02f872b7..00000000
--- a/tutorials/applications/recipes/loading_databases.ipynb
+++ /dev/null
@@ -1,141 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Loading database tables\n",
- "\n",
- "We can either use a custom `Creator` load from a database to a `DataFrame` or we can use the database engine service itself to output a file which can then be loaded in `fugue`."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's dump some data into a `sqlite` database & read it in `fugue`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import sqlite3\n",
- "\n",
- "from fugue import FugueWorkflow\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "def create_sqlite_db(path: str, content: str, table: str):\n",
- " uri = f\"file:{path}\"\n",
- " lines = content.split(\"\\n\")\n",
- " headers = lines[0]\n",
- " rows = lines[1:]\n",
- " with sqlite3.connect(uri, uri=True) as con:\n",
- " cur = con.cursor()\n",
- " cur.execute(f\"CREATE TABLE {table}({headers})\")\n",
- " values = \"(\" + '),('.join(row for row in rows) + \")\"\n",
- " cur.execute(f\"INSERT INTO {table} VALUES {values}\")\n",
- " con.commit()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "content = '''\\\n",
- "\"TIMESTAMP\",\"RECORD\",\"WS_80m_90deg_Avg\",\"WS_80m_90deg_Std\",\"WS_80m_90deg_3sGust_Max\",\"WS_80m_90deg_Max\",\"WS_80m_270deg_Avg\",\"WS_80m_270deg_Std\",\"WS_80m_270deg_3sGust_Max\",\"WS_80m_270deg_Max\",\"WS_65m_90deg_Avg\",\"WS_65m_90deg_Std\",\"WS_65m_90deg_3sGust_Max\",\"WS_65m_90deg_Max\",\"WS_65m_270deg_Avg\",\"WS_65m_270deg_Std\",\"WS_65m_270deg_3sGust_Max\",\"WS_65m_270deg_Max\",\"WS_50m_90deg_Avg\",\"WS_50m_90deg_Std\",\"WS_50m_90deg_3sGust_Max\",\"WS_50m_90deg_Max\",\"WS_50m_270deg_Avg\",\"WS_50m_270deg_Std\",\"WS_50m_270deg_3sGust_Max\",\"WS_50m_270deg_Max\",\"WS_30m_90deg_Avg\",\"WS_30m_90deg_Std\",\"WS_30m_90deg_3sGust_Max\",\"WS_30m_90deg_Max\",\"WS_30m_270deg_Avg\",\"WS_30m_270deg_Std\",\"WS_30m_270deg_3sGust_Max\",\"WS_30m_270deg_Max\",\"Dir_78m_90deg_avg\",\"Dir_78m_90deg_std\",\"Dir_63m_90deg_avg\",\"Dir_63m_90deg_std\",\"Dir_28m_90deg_avg\",\"Dir_28m_90deg_std\",\"Batt_Volt_Min\",\"Press_Avg\",\"Temp_C80_Avg\",\"Temp_C15_Avg\",\"Hum_Avg\"\n",
- "\"2012-05-31 12:20:00\",1,1.383,0.6,2.75,3.37,1.368,0.439,2.673,2.74,1.332,0.478,2.75,2.75,1.242,0.379,2.74,2.79,1.162,0.535,2.337,2.75,1.159,0.354,2.34,2.39,1.27,0.614,2.337,2.75,1.322,0.416,2.157,2.24,240.3,46,242,45.39,222,33.45,13.79,1009,13.84,14.08,65.67\n",
- "\"2012-05-31 12:30:00\",2,1.183,0.449,1.923,2.13,1.135,0.324,1.94,1.99,0.948,0.524,1.923,2.13,1.068,0.303,1.723,1.74,0.701,0.547,1.923,2.13,0.913,0.308,1.673,1.74,0.771,0.539,1.717,2.13,0.997,0.28,1.657,1.74,282,26.79,264.3,30.25,278.5,62.87,13.73,1009,14.04,14.45,64.51'''"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "def load_db(table: str, con: str) -> pd.DataFrame:\n",
- " return pd.read_sql_table(table, uri)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "PandasDataFrame\n",
- "TIMESTAMP:str |RECORD:long|WS_80m_90deg_Avg:double|WS_80m_90deg_Std:double|WS_80m_90deg_3sGust_Max:double|WS_80m_90deg_Max:double|WS_80m_270deg_Avg:double|WS_80m_270deg_Std:double|WS_80m_270deg_3sGust_Max:double|WS_80m_270deg_Max:double|WS_65m_90deg_Avg:double|WS_65m_90deg_Std:double|WS_65m_90deg_3sGust_Max:double|WS_65m_90deg_Max:double|WS_65m_270deg_Avg:double|WS_65m_270deg_Std:double|WS_65m_270deg_3sGust_Max:double|WS_65m_270deg_Max:double|WS_50m_90deg_Avg:double|WS_50m_90deg_Std:double|WS_50m_90deg_3sGust_Max:double|WS_50m_90deg_Max:double|WS_50m_270deg_Avg:double|WS_50m_270deg_Std:double|WS_50m_270deg_3sGust_Max:double|WS_50m_270deg_Max:double|WS_30m_90deg_Avg:double|WS_30m_90deg_Std:double|WS_30m_90deg_3sGust_Max:double|WS_30m_90deg_Max:double|WS_30m_270deg_Avg:double|WS_30m_270deg_Std:double|WS_30m_270deg_3sGust_Max:double|WS_30m_270deg_Max:double|Dir_78m_90deg_avg:double|Dir_78m_90deg_std:double|Dir_63m_90deg_avg:double|Dir_63m_90deg_std:double|Dir_28m_90deg_avg:double|Dir_28m_90deg_std:double|Batt_Volt_Min:double|Press_Avg:long|Temp_C80_Avg:double|Temp_C15_Avg:double|Hum_Avg:double\n",
- "--------------+-----------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+--------------------+--------------+-------------------+-------------------+--------------\n",
- "2012-05-31 12:|1 |1.383 |0.6 |2.75 |3.37 |1.368 |0.439 |2.673 |2.74 |1.332 |0.478 |2.75 |2.75 |1.242 |0.379 |2.74 |2.79 |1.162 |0.535 |2.337 |2.75 |1.159 |0.354 |2.34 |2.39 |1.27 |0.614 |2.337 |2.75 |1.322 |0.416 |2.157 |2.24 |240.3 |46.0 |242.0 |45.39 |222.0 |33.45 |13.79 |1009 |13.84 |14.08 |65.67 \n",
- "20:00 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | \n",
- "2012-05-31 12:|2 |1.183 |0.449 |1.923 |2.13 |1.135 |0.324 |1.94 |1.99 |0.948 |0.524 |1.923 |2.13 |1.068 |0.303 |1.723 |1.74 |0.701 |0.547 |1.923 |2.13 |0.913 |0.308 |1.673 |1.74 |0.771 |0.539 |1.717 |2.13 |0.997 |0.28 |1.657 |1.74 |282.0 |26.79 |264.3 |30.25 |278.5 |62.87 |13.73 |1009 |14.04 |14.45 |64.51 \n",
- "30:00 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | \n",
- "Total count: 2\n",
- "\n"
- ]
- }
- ],
- "source": [
- "db = \"/tmp/example.db\"\n",
- "uri = f\"sqlite:///{db}\"\n",
- "create_sqlite_db(db, content, table=\"sensors\")\n",
- "\n",
- "with FugueWorkflow() as dag:\n",
- " df = dag.create(load_db, params={\"table\": \"sensors\", \"con\": uri})\n",
- " df.show()\n",
- "\n",
- "os.unlink(db)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3.10.6 ('fugue')",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.6"
- },
- "orig_nbformat": 4,
- "vscode": {
- "interpreter": {
- "hash": "787dcf4b06485f091e5ea0894a1563cc39da567670ce0a93adb376c1b3122bd1"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/tutorials/applications/recipes/loading_text_files.ipynb b/tutorials/applications/recipes/loading_text_files.ipynb
deleted file mode 100644
index f4b0a8a0..00000000
--- a/tutorials/applications/recipes/loading_text_files.ipynb
+++ /dev/null
@@ -1,360 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Loading text files\n",
- "\n",
- "`fugue` can read text files natively via `load` or by dropping into an execution engine\n",
- "\n",
- "You might find it useful to use the execution engine directly for loading non-standard files or files that are not natively supported by `fugue`. \n",
- "\n",
- "We'll demonstrate `pandas`, `duckdb` & `dask` here"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Here's an example where the headers are on the 2nd line & the data starts on the 5th:"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\"SITENAME\"\n",
- "\"TIMESTAMP\",\"RECORD\",\"WS_80m_90deg_Avg\",\"WS_80m_90deg_Std\",\"WS_80m_90deg_3sGust_Max\",\"WS_80m_90deg_Max\",\"WS_80m_270deg_Avg\",\"WS_80m_270deg_Std\",\"WS_80m_270deg_3sGust_Max\",\"WS_80m_270deg_Max\",\"WS_65m_90deg_Avg\",\"WS_65m_90deg_Std\",\"WS_65m_90deg_3sGust_Max\",\"WS_65m_90deg_Max\",\"WS_65m_270deg_Avg\",\"WS_65m_270deg_Std\",\"WS_65m_270deg_3sGust_Max\",\"WS_65m_270deg_Max\",\"WS_50m_90deg_Avg\",\"WS_50m_90deg_Std\",\"WS_50m_90deg_3sGust_Max\",\"WS_50m_90deg_Max\",\"WS_50m_270deg_Avg\",\"WS_50m_270deg_Std\",\"WS_50m_270deg_3sGust_Max\",\"WS_50m_270deg_Max\",\"WS_30m_90deg_Avg\",\"WS_30m_90deg_Std\",\"WS_30m_90deg_3sGust_Max\",\"WS_30m_90deg_Max\",\"WS_30m_270deg_Avg\",\"WS_30m_270deg_Std\",\"WS_30m_270deg_3sGust_Max\",\"WS_30m_270deg_Max\",\"Dir_78m_90deg_avg\",\"Dir_78m_90deg_std\",\"Dir_63m_90deg_avg\",\"Dir_63m_90deg_std\",\"Dir_28m_90deg_avg\",\"Dir_28m_90deg_std\",\"Batt_Volt_Min\",\"Press_Avg\",\"Temp_C80_Avg\",\"Temp_C15_Avg\",\"Hum_Avg\"\n",
- "\"TS\",\"RN\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"meters/second\",\"meters/second\",\"meters/second\",\"meters/second\",\"Volts\",\"mB\",\"DegC\",\"DegC\",\"%\"\n",
- "\"\",\"\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"WVc\",\"WVc\",\"WVc\",\"WVc\",\"WVc\",\"WVc\",\"Min\",\"Avg\",\"Avg\",\"Avg\",\"Avg\"\n",
- "\"2012-05-31 12:20:00\",1,1.383,0.6,2.75,3.37,1.368,0.439,2.673,2.74,1.332,0.478,2.75,2.75,1.242,0.379,2.74,2.79,1.162,0.535,2.337,2.75,1.159,0.354,2.34,2.39,1.27,0.614,2.337,2.75,1.322,0.416,2.157,2.24,240.3,46,242,45.39,222,33.45,13.79,1009,13.84,14.08,65.67\n",
- "\"2012-05-31 12:30:00\",2,1.183,0.449,1.923,2.13,1.135,0.324,1.94,1.99,0.948,0.524,1.923,2.13,1.068,0.303,1.723,1.74,0.701,0.547,1.923,2.13,0.913,0.308,1.673,1.74,0.771,0.539,1.717,2.13,0.997,0.28,1.657,1.74,282,26.79,264.3,30.25,278.5,62.87,13.73,1009,14.04,14.45,64.51"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To read it we'll need to create a custom `Creator` in `fugue`!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import typing\n",
- "\n",
- "import duckdb\n",
- "from fugue import DataFrame\n",
- "from fugue import ExecutionEngine\n",
- "from fugue import FugueWorkflow\n",
- "from fugue import NativeExecutionEngine\n",
- "from fugue_dask import DaskExecutionEngine\n",
- "from fugue_duckdb import DuckExecutionEngine\n",
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Standard text files"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "First let's create a sample `standard` text file ..."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "content = \"\"\"\\\n",
- "a,b,c\n",
- "1,2,3\n",
- "1,2,3\"\"\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can read it natively"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "PandasDataFrame\n",
- "a:str|b:str|c:str\n",
- "-----+-----+-----\n",
- "1 |2 |3 \n",
- "1 |2 |3 \n",
- "Total count: 2\n",
- "\n"
- ]
- }
- ],
- "source": [
- "file = \"/tmp/fugue_example_std_1.csv\"\n",
- "with open(file, \"w\") as f:\n",
- " f.write(content)\n",
- "\n",
- "with FugueWorkflow() as dag:\n",
- " df = dag.load(file, header=True)\n",
- " df.show()\n",
- "\n",
- "os.unlink(file)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can read multiple files using a wildcard `*` "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "PandasDataFrame\n",
- "a:str|b:str|c:str\n",
- "-----+-----+-----\n",
- "1 |2 |3 \n",
- "1 |2 |3 \n",
- "1 |2 |3 \n",
- "1 |2 |3 \n",
- "Total count: 4\n",
- "\n"
- ]
- }
- ],
- "source": [
- "file_1 = \"/tmp/fugue_example_std_2.csv\"\n",
- "file_2 = \"/tmp/fugue_example_std_3.csv\"\n",
- "with open(file_1, \"w\") as f1, open(file_2, \"w\") as f2:\n",
- " f1.write(content)\n",
- " f2.write(content)\n",
- "wildcard = \"/tmp/fugue_example_std_*.csv\"\n",
- "\n",
- "with FugueWorkflow() as dag:\n",
- " df = dag.load(wildcard, header=True)\n",
- " df.show()\n",
- "\n",
- "os.unlink(file_1)\n",
- "os.unlink(file_2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Non-standard text files\n",
- "\n",
- "Or, if your input file is non-standard, we can use the execution engine directly "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "content = '''\\\n",
- "\"SITENAME\"\n",
- "\"TIMESTAMP\",\"RECORD\",\"WS_80m_90deg_Avg\",\"WS_80m_90deg_Std\",\"WS_80m_90deg_3sGust_Max\",\"WS_80m_90deg_Max\",\"WS_80m_270deg_Avg\",\"WS_80m_270deg_Std\",\"WS_80m_270deg_3sGust_Max\",\"WS_80m_270deg_Max\",\"WS_65m_90deg_Avg\",\"WS_65m_90deg_Std\",\"WS_65m_90deg_3sGust_Max\",\"WS_65m_90deg_Max\",\"WS_65m_270deg_Avg\",\"WS_65m_270deg_Std\",\"WS_65m_270deg_3sGust_Max\",\"WS_65m_270deg_Max\",\"WS_50m_90deg_Avg\",\"WS_50m_90deg_Std\",\"WS_50m_90deg_3sGust_Max\",\"WS_50m_90deg_Max\",\"WS_50m_270deg_Avg\",\"WS_50m_270deg_Std\",\"WS_50m_270deg_3sGust_Max\",\"WS_50m_270deg_Max\",\"WS_30m_90deg_Avg\",\"WS_30m_90deg_Std\",\"WS_30m_90deg_3sGust_Max\",\"WS_30m_90deg_Max\",\"WS_30m_270deg_Avg\",\"WS_30m_270deg_Std\",\"WS_30m_270deg_3sGust_Max\",\"WS_30m_270deg_Max\",\"Dir_78m_90deg_avg\",\"Dir_78m_90deg_std\",\"Dir_63m_90deg_avg\",\"Dir_63m_90deg_std\",\"Dir_28m_90deg_avg\",\"Dir_28m_90deg_std\",\"Batt_Volt_Min\",\"Press_Avg\",\"Temp_C80_Avg\",\"Temp_C15_Avg\",\"Hum_Avg\"\n",
- "\"TS\",\"RN\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"\",\"meters/second\",\"meters/second\",\"meters/second\",\"meters/second\",\"meters/second\",\"meters/second\",\"meters/second\",\"Volts\",\"mB\",\"DegC\",\"DegC\",\"%\"\n",
- "\"\",\"\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"Avg\",\"Std\",\"Max\",\"Max\",\"WVc\",\"WVc\",\"WVc\",\"WVc\",\"WVc\",\"WVc\",\"Min\",\"Avg\",\"Avg\",\"Avg\",\"Avg\"\n",
- "\"2012-05-31 12:20:00\",1,1.383,0.6,2.75,3.37,1.368,0.439,2.673,2.74,1.332,0.478,2.75,2.75,1.242,0.379,2.74,2.79,1.162,0.535,2.337,2.75,1.159,0.354,2.34,2.39,1.27,0.614,2.337,2.75,1.322,0.416,2.157,2.24,240.3,46,242,45.39,222,33.45,13.79,1009,13.84,14.08,65.67\n",
- "\"2012-05-31 12:30:00\",2,1.183,0.449,1.923,2.13,1.135,0.324,1.94,1.99,0.948,0.524,1.923,2.13,1.068,0.303,1.723,1.74,0.701,0.547,1.923,2.13,0.913,0.308,1.673,1.74,0.771,0.539,1.717,2.13,0.997,0.28,1.657,1.74,282,26.79,264.3,30.25,278.5,62.87,13.73,1009,14.04,14.45,64.51\n",
- "'''"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's read the headers on the 2nd line separately to loading the text file "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def read_header(filepath: str) -> typing.List[str]:\n",
- " row_1 = pd.read_csv(filepath, skiprows=1, nrows=0).columns\n",
- " header = [row_1[0].replace(\"columns: \", \"\"), *row_1[1:]]\n",
- " return header"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "& specify these headers as the column names of the data that we are loading in\n",
- "\n",
- "> **Note:** `skip` & `columns` for `DuckExecutionEngine` correspond to `skiprows` & `names` for `pandas.read_csv` as `duckdb` `csv` has different conventions "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def read_text_file(engine: ExecutionEngine, filepath: str) -> DataFrame:\n",
- " headers = read_header(filepath)\n",
- " if isinstance(engine, NativeExecutionEngine):\n",
- " # load_df uses pandas.read_csv\n",
- " df = engine.load_df(filepath, infer_schema=True, header=True, skiprows=3, names=headers)\n",
- " elif isinstance(engine, DuckExecutionEngine):\n",
- " # load_df uses duckdb read_csv_auto\n",
- " df = engine.load_df(filepath, infer_schema=True, skip=4, columns=headers)\n",
- " elif isinstance(engine, DaskExecutionEngine):\n",
- " # load_df uses dask.dataframe.read_csv\n",
- " df = engine.load_df(filepath, infer_schema=True, header=True, skiprows=3, names=headers)\n",
- " else:\n",
- " supported_engines = {NativeExecutionEngine, DuckExecutionEngine, DaskExecutionEngine} \n",
- " raise ValueError(f\"Engine {engine} is not supported, must be one of {supported_engines}\")\n",
- " return df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "%%html\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "PandasDataFrame\n",
- "TIMESTAMP:str |RECORD:long|WS_80m_90deg_Avg:double|WS_80m_90deg_Std:double|WS_80m_90deg_3sGust_Max:double|WS_80m_90deg_Max:double|WS_80m_270deg_Avg:double|WS_80m_270deg_Std:double|WS_80m_270deg_3sGust_Max:double|WS_80m_270deg_Max:double|WS_65m_90deg_Avg:double|WS_65m_90deg_Std:double|WS_65m_90deg_3sGust_Max:double|WS_65m_90deg_Max:double|WS_65m_270deg_Avg:double|WS_65m_270deg_Std:double|WS_65m_270deg_3sGust_Max:double|WS_65m_270deg_Max:double|WS_50m_90deg_Avg:double|WS_50m_90deg_Std:double|WS_50m_90deg_3sGust_Max:double|WS_50m_90deg_Max:double|WS_50m_270deg_Avg:double|WS_50m_270deg_Std:double|WS_50m_270deg_3sGust_Max:double|WS_50m_270deg_Max:double|WS_30m_90deg_Avg:double|WS_30m_90deg_Std:double|WS_30m_90deg_3sGust_Max:double|WS_30m_90deg_Max:double|WS_30m_270deg_Avg:double|WS_30m_270deg_Std:double|WS_30m_270deg_3sGust_Max:double|WS_30m_270deg_Max:double|Dir_78m_90deg_avg:double|Dir_78m_90deg_std:double|Dir_63m_90deg_avg:double|Dir_63m_90deg_std:double|Dir_28m_90deg_avg:double|Dir_28m_90deg_std:double|Batt_Volt_Min:double|Press_Avg:long|Temp_C80_Avg:double|Temp_C15_Avg:double|Hum_Avg:double\n",
- "--------------+-----------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+--------------------+--------------+-------------------+-------------------+--------------\n",
- "2012-05-31 12:|1 |1.383 |0.6 |2.75 |3.37 |1.368 |0.439 |2.673 |2.74 |1.332 |0.478 |2.75 |2.75 |1.242 |0.379 |2.74 |2.79 |1.162 |0.535 |2.337 |2.75 |1.159 |0.354 |2.34 |2.39 |1.27 |0.614 |2.337 |2.75 |1.322 |0.416 |2.157 |2.24 |240.3 |46.0 |242.0 |45.39 |222.0 |33.45 |13.79 |1009 |13.84 |14.08 |65.67 \n",
- "20:00 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | \n",
- "2012-05-31 12:|2 |1.183 |0.449 |1.923 |2.13 |1.135 |0.324 |1.94 |1.99 |0.948 |0.524 |1.923 |2.13 |1.068 |0.303 |1.723 |1.74 |0.701 |0.547 |1.923 |2.13 |0.913 |0.308 |1.673 |1.74 |0.771 |0.539 |1.717 |2.13 |0.997 |0.28 |1.657 |1.74 |282.0 |26.79 |264.3 |30.25 |278.5 |62.87 |13.73 |1009 |14.04 |14.45 |64.51 \n",
- "30:00 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | \n",
- "Total count: 2\n",
- "\n",
- "DuckDataFrame\n",
- "timestamp:datetime |RECORD:int|WS_80m_90deg_Avg:double|WS_80m_90deg_Std:double|WS_80m_90deg_3sGust_Max:double|WS_80m_90deg_Max:double|WS_80m_270deg_Avg:double|WS_80m_270deg_Std:double|WS_80m_270deg_3sGust_Max:double|WS_80m_270deg_Max:double|WS_65m_90deg_Avg:double|WS_65m_90deg_Std:double|WS_65m_90deg_3sGust_Max:double|WS_65m_90deg_Max:double|WS_65m_270deg_Avg:double|WS_65m_270deg_Std:double|WS_65m_270deg_3sGust_Max:double|WS_65m_270deg_Max:double|WS_50m_90deg_Avg:double|WS_50m_90deg_Std:double|WS_50m_90deg_3sGust_Max:double|WS_50m_90deg_Max:double|WS_50m_270deg_Avg:double|WS_50m_270deg_Std:double|WS_50m_270deg_3sGust_Max:double|WS_50m_270deg_Max:double|WS_30m_90deg_Avg:double|WS_30m_90deg_Std:double|WS_30m_90deg_3sGust_Max:double|WS_30m_90deg_Max:double|WS_30m_270deg_Avg:double|WS_30m_270deg_Std:double|WS_30m_270deg_3sGust_Max:double|WS_30m_270deg_Max:double|Dir_78m_90deg_avg:int|Dir_78m_90deg_std:double|Dir_63m_90deg_avg:double|Dir_63m_90deg_std:double|Dir_28m_90deg_avg:double|Dir_28m_90deg_std:double|Batt_Volt_Min:double|Press_Avg:int|Temp_C80_Avg:double|Temp_C15_Avg:double|Hum_Avg:double\n",
- "-------------------+----------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+---------------------+------------------------+------------------------+------------------------+------------------------+------------------------+--------------------+-------------+-------------------+-------------------+--------------\n",
- "2012-05-31 12:20:00|1 |1.383 |0.6 |2.75 |3.37 |1.368 |0.439 |2.673 |2.74 |1.332 |0.478 |2.75 |2.75 |1.242 |0.379 |2.74 |2.79 |1.162 |0.535 |2.337 |2.75 |1.159 |0.354 |2.34 |2.39 |1.27 |0.614 |2.337 |2.75 |1.322 |0.416 |2.157 |2.24 |240 |46.0 |242.0 |45.39 |222.0 |33.45 |13.79 |1009 |13.84 |14.08 |65.67 \n",
- "2012-05-31 12:30:00|2 |1.183 |0.449 |1.923 |2.13 |1.135 |0.324 |1.94 |1.99 |0.948 |0.524 |1.923 |2.13 |1.068 |0.303 |1.723 |1.74 |0.701 |0.547 |1.923 |2.13 |0.913 |0.308 |1.673 |1.74 |0.771 |0.539 |1.717 |2.13 |0.997 |0.28 |1.657 |1.74 |282 |26.79 |264.3 |30.25 |278.5 |62.87 |13.73 |1009 |14.04 |14.45 |64.51 \n",
- "Total count: 2\n",
- "\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "DaskDataFrame\n",
- "TIMESTAMP:str |RECORD:long|WS_80m_90deg_Avg:double|WS_80m_90deg_Std:double|WS_80m_90deg_3sGust_Max:double|WS_80m_90deg_Max:double|WS_80m_270deg_Avg:double|WS_80m_270deg_Std:double|WS_80m_270deg_3sGust_Max:double|WS_80m_270deg_Max:double|WS_65m_90deg_Avg:double|WS_65m_90deg_Std:double|WS_65m_90deg_3sGust_Max:double|WS_65m_90deg_Max:double|WS_65m_270deg_Avg:double|WS_65m_270deg_Std:double|WS_65m_270deg_3sGust_Max:double|WS_65m_270deg_Max:double|WS_50m_90deg_Avg:double|WS_50m_90deg_Std:double|WS_50m_90deg_3sGust_Max:double|WS_50m_90deg_Max:double|WS_50m_270deg_Avg:double|WS_50m_270deg_Std:double|WS_50m_270deg_3sGust_Max:double|WS_50m_270deg_Max:double|WS_30m_90deg_Avg:double|WS_30m_90deg_Std:double|WS_30m_90deg_3sGust_Max:double|WS_30m_90deg_Max:double|WS_30m_270deg_Avg:double|WS_30m_270deg_Std:double|WS_30m_270deg_3sGust_Max:double|WS_30m_270deg_Max:double|Dir_78m_90deg_avg:double|Dir_78m_90deg_std:double|Dir_63m_90deg_avg:double|Dir_63m_90deg_std:double|Dir_28m_90deg_avg:double|Dir_28m_90deg_std:double|Batt_Volt_Min:double|Press_Avg:long|Temp_C80_Avg:double|Temp_C15_Avg:double|Hum_Avg:double\n",
- "--------------+-----------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+-----------------------+-----------------------+------------------------------+-----------------------+------------------------+------------------------+-------------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+------------------------+--------------------+--------------+-------------------+-------------------+--------------\n",
- "2012-05-31 12:|1 |1.383 |0.6 |2.75 |3.37 |1.368 |0.439 |2.673 |2.74 |1.332 |0.478 |2.75 |2.75 |1.242 |0.379 |2.74 |2.79 |1.162 |0.535 |2.337 |2.75 |1.159 |0.354 |2.34 |2.39 |1.27 |0.614 |2.337 |2.75 |1.322 |0.416 |2.157 |2.24 |240.3 |46.0 |242.0 |45.39 |222.0 |33.45 |13.79 |1009 |13.84 |14.08 |65.67 \n",
- "20:00 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | \n",
- "2012-05-31 12:|2 |1.183 |0.449 |1.923 |2.13 |1.135 |0.324 |1.94 |1.99 |0.948 |0.524 |1.923 |2.13 |1.068 |0.303 |1.723 |1.74 |0.701 |0.547 |1.923 |2.13 |0.913 |0.308 |1.673 |1.74 |0.771 |0.539 |1.717 |2.13 |0.997 |0.28 |1.657 |1.74 |282.0 |26.79 |264.3 |30.25 |278.5 |62.87 |13.73 |1009 |14.04 |14.45 |64.51 \n",
- "30:00 | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | \n",
- "Total count: 2\n",
- "\n"
- ]
- }
- ],
- "source": [
- "file = \"/tmp/fugue_example_nonstd.csv\"\n",
- "with open(file, \"w\") as f:\n",
- " f.write(content)\n",
- "\n",
- "with FugueWorkflow() as dag:\n",
- " df = dag.create(read_text_file, params={\"filepath\": file})\n",
- " df.show()\n",
- "\n",
- "with FugueWorkflow(engine=\"duckdb\") as dag:\n",
- " df = dag.create(read_text_file, params={\"filepath\": file})\n",
- " df.show()\n",
- "\n",
- "with FugueWorkflow(engine=\"dask\") as dag:\n",
- " df = dag.create(read_text_file, params={\"filepath\": file})\n",
- " df.show()\n",
- "\n",
- "os.unlink(file)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3.10.6 ('fugue')",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "name": "python",
- "version": "3.10.6"
- },
- "vscode": {
- "interpreter": {
- "hash": "787dcf4b06485f091e5ea0894a1563cc39da567670ce0a93adb376c1b3122bd1"
- }
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}