Spaces:
Sleeping
Sleeping
Add Conclusion section and mention DuckDB polars support
Browse files- polars/03_loading_data.py +109 -11
polars/03_loading_data.py
CHANGED
|
@@ -2,10 +2,11 @@
|
|
| 2 |
# requires-python = ">=3.12"
|
| 3 |
# dependencies = [
|
| 4 |
# "adbc-driver-sqlite==1.6.0",
|
| 5 |
-
# "
|
|
|
|
| 6 |
# "marimo",
|
| 7 |
-
# "pandas==2.
|
| 8 |
-
# "polars==1.
|
| 9 |
# "pyarrow==20.0.0",
|
| 10 |
# "sqlalchemy==2.0.41",
|
| 11 |
# ]
|
|
@@ -13,7 +14,7 @@
|
|
| 13 |
|
| 14 |
import marimo
|
| 15 |
|
| 16 |
-
__generated_with = "0.14.
|
| 17 |
app = marimo.App(width="medium")
|
| 18 |
|
| 19 |
|
|
@@ -260,7 +261,6 @@ def _(mo):
|
|
| 260 |
- You are expected to provide a Schema before the Generator starts
|
| 261 |
- - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
|
| 262 |
- Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
|
| 263 |
-
|
| 264 |
"""
|
| 265 |
)
|
| 266 |
return
|
|
@@ -333,6 +333,53 @@ def _(Iterator, get_positional_names, itertools, pl, register_io_source):
|
|
| 333 |
return (my_custom_input_plugin,)
|
| 334 |
|
| 335 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
@app.cell(hide_code=True)
|
| 337 |
def _(mo):
|
| 338 |
mo.md(
|
|
@@ -445,6 +492,32 @@ def _(adlfs, df, os, pl):
|
|
| 445 |
return
|
| 446 |
|
| 447 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
@app.cell
|
| 449 |
def _():
|
| 450 |
import marimo as mo
|
|
@@ -482,13 +555,9 @@ def _():
|
|
| 482 |
def _():
|
| 483 |
import polars as pl
|
| 484 |
import pandas as pd
|
| 485 |
-
return pd, pl
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
@app.cell
|
| 489 |
-
def _():
|
| 490 |
from polars.io.plugins import register_io_source
|
| 491 |
-
|
|
|
|
| 492 |
|
| 493 |
|
| 494 |
@app.cell
|
|
@@ -505,5 +574,34 @@ def _(itertools, string):
|
|
| 505 |
return (get_positional_names,)
|
| 506 |
|
| 507 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
if __name__ == "__main__":
|
| 509 |
app.run()
|
|
|
|
| 2 |
# requires-python = ">=3.12"
|
| 3 |
# dependencies = [
|
| 4 |
# "adbc-driver-sqlite==1.6.0",
|
| 5 |
+
# "duckdb==1.3.1",
|
| 6 |
+
# "lxml==6.0.0",
|
| 7 |
# "marimo",
|
| 8 |
+
# "pandas==2.3.0",
|
| 9 |
+
# "polars==1.31.0",
|
| 10 |
# "pyarrow==20.0.0",
|
| 11 |
# "sqlalchemy==2.0.41",
|
| 12 |
# ]
|
|
|
|
| 14 |
|
| 15 |
import marimo
|
| 16 |
|
| 17 |
+
__generated_with = "0.14.8"
|
| 18 |
app = marimo.App(width="medium")
|
| 19 |
|
| 20 |
|
|
|
|
| 261 |
- You are expected to provide a Schema before the Generator starts
|
| 262 |
- - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
|
| 263 |
- Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
|
|
|
|
| 264 |
"""
|
| 265 |
)
|
| 266 |
return
|
|
|
|
| 333 |
return (my_custom_input_plugin,)
|
| 334 |
|
| 335 |
|
| 336 |
+
@app.cell(hide_code=True)
|
| 337 |
+
def _(mo):
|
| 338 |
+
mo.md(
|
| 339 |
+
r"""
|
| 340 |
+
### DuckDB
|
| 341 |
+
|
| 342 |
+
In addition to Arrow interoperability support, [DuckDB](https://duckdb.org/) has also added support for loading query results into a polars LazyFrame
|
| 343 |
+
|
| 344 |
+
You can read more about polars and duckdb integrations in
|
| 345 |
+
|
| 346 |
+
- https://docs.pola.rs/user-guide/ecosystem/#duckdb
|
| 347 |
+
- https://duckdb.org/docs/stable/guides/python/polars.html
|
| 348 |
+
|
| 349 |
+
You can also learn more about DuckDB, including marimo's SQL-related features, in the marimo course dedicated to it.
|
| 350 |
+
"""
|
| 351 |
+
)
|
| 352 |
+
return
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
@app.cell
|
| 356 |
+
def _():
|
| 357 |
+
# Especially useful if you need features not yet supported by Polars, such as geospatial data
|
| 358 |
+
duckdb_query = """
|
| 359 |
+
SELECT
|
| 360 |
+
id,
|
| 361 |
+
name,
|
| 362 |
+
ST_X(geometry) as longitude,
|
| 363 |
+
ST_Y(geometry) as latitude
|
| 364 |
+
FROM locations
|
| 365 |
+
"""
|
| 366 |
+
return (duckdb_query,)
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
@app.cell
|
| 370 |
+
def _(duckdb_conn, duckdb_query):
|
| 371 |
+
# Eager (default):
|
| 372 |
+
duckdb_conn.execute(duckdb_query).pl()
|
| 373 |
+
return
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
@app.cell(disabled=True)
|
| 377 |
+
def _(duckdb_conn, duckdb_query):
|
| 378 |
+
# Lazy (merged upstream but not yet released at the time of writing; requires DuckDB > 1.3.1):
|
| 379 |
+
duckdb_conn.execute(duckdb_query).pl(lazy=True)
|
| 380 |
+
return
|
| 381 |
+
|
| 382 |
+
|
| 383 |
@app.cell(hide_code=True)
|
| 384 |
def _(mo):
|
| 385 |
mo.md(
|
|
|
|
| 492 |
return
|
| 493 |
|
| 494 |
|
| 495 |
+
@app.cell(hide_code=True)
|
| 496 |
+
def _(mo):
|
| 497 |
+
mo.md(
|
| 498 |
+
r"""
|
| 499 |
+
## Conclusion
|
| 500 |
+
As you have seen, polars makes it easy to work with a variety of formats and different data sources.
|
| 501 |
+
|
| 502 |
+
From natively supported formats such as Parquet and CSV files, to using other libraries as an intermediary for XML or geospatial data, and plugins for newly emerging or proprietary formats, as long as your data can fit in a table then odds are you can turn it into a polars DataFrame.
|
| 503 |
+
|
| 504 |
+
Combined with loading directly from remote sources, including public data platforms such as Hugging Face and Kaggle as well as private data in your cloud, you can import datasets for almost anything you can imagine.
|
| 505 |
+
"""
|
| 506 |
+
)
|
| 507 |
+
return
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
@app.cell(hide_code=True)
|
| 511 |
+
def _(mo):
|
| 512 |
+
mo.md(
|
| 513 |
+
r"""
|
| 514 |
+
## Utilities
|
| 515 |
+
Imports, utility functions, and the like used throughout the notebook
|
| 516 |
+
"""
|
| 517 |
+
)
|
| 518 |
+
return
|
| 519 |
+
|
| 520 |
+
|
| 521 |
@app.cell
|
| 522 |
def _():
|
| 523 |
import marimo as mo
|
|
|
|
| 555 |
def _():
|
| 556 |
import polars as pl
|
| 557 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
from polars.io.plugins import register_io_source
|
| 559 |
+
import duckdb
|
| 560 |
+
return duckdb, pd, pl, register_io_source
|
| 561 |
|
| 562 |
|
| 563 |
@app.cell
|
|
|
|
| 574 |
return (get_positional_names,)
|
| 575 |
|
| 576 |
|
| 577 |
+
@app.cell
|
| 578 |
+
def _(duckdb):
|
| 579 |
+
# Connect to an ephemeral in-memory DuckDB database
|
| 580 |
+
duckdb_conn = duckdb.connect(":memory:")
|
| 581 |
+
|
| 582 |
+
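# Load the spatial extension for geometry support (NOTE: this call only loads it; the extension may first need to be installed, e.g. with duckdb_conn.install_extension("spatial") - confirm)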
|
| 583 |
+
duckdb_conn.load_extension("spatial")
|
| 584 |
+
|
| 585 |
+
# Create a table with geometry column
|
| 586 |
+
duckdb_conn.execute("""
|
| 587 |
+
CREATE TABLE locations (
|
| 588 |
+
id INTEGER,
|
| 589 |
+
name VARCHAR,
|
| 590 |
+
geometry GEOMETRY
|
| 591 |
+
)
|
| 592 |
+
""")
|
| 593 |
+
|
| 594 |
+
# Insert some sample data with geometry points
|
| 595 |
+
duckdb_conn.execute("""
|
| 596 |
+
INSERT INTO locations VALUES
|
| 597 |
+
(1, 'New York', ST_Point(-74.0059, 40.7128)),
|
| 598 |
+
(2, 'Los Angeles', ST_Point(-118.2437, 34.0522)),
|
| 599 |
+
(3, 'Chicago', ST_Point(-87.6298, 41.8781)),
|
| 600 |
+
(4, 'Houston', ST_Point(-95.3698, 29.7604)),
|
| 601 |
+
(5, 'Phoenix', ST_Point(-112.0740, 33.4484))
|
| 602 |
+
""")
|
| 603 |
+
return (duckdb_conn,)
|
| 604 |
+
|
| 605 |
+
|
| 606 |
if __name__ == "__main__":
|
| 607 |
app.run()
|