etrotta committed on
Commit
7bcccc1
·
1 Parent(s): 59561cb

Add Conclusion section and mention DuckDB polars support

Browse files
Files changed (1) hide show
  1. polars/03_loading_data.py +109 -11
polars/03_loading_data.py CHANGED
@@ -2,10 +2,11 @@
2
  # requires-python = ">=3.12"
3
  # dependencies = [
4
  # "adbc-driver-sqlite==1.6.0",
5
- # "lxml==5.4.0",
 
6
  # "marimo",
7
- # "pandas==2.2.3",
8
- # "polars==1.30.0",
9
  # "pyarrow==20.0.0",
10
  # "sqlalchemy==2.0.41",
11
  # ]
@@ -13,7 +14,7 @@
13
 
14
  import marimo
15
 
16
- __generated_with = "0.14.7"
17
  app = marimo.App(width="medium")
18
 
19
 
@@ -260,7 +261,6 @@ def _(mo):
260
  - You are expected to provide a Schema before the Generator starts
261
  - - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
262
  - Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
263
-
264
  """
265
  )
266
  return
@@ -333,6 +333,53 @@ def _(Iterator, get_positional_names, itertools, pl, register_io_source):
333
  return (my_custom_input_plugin,)
334
 
335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  @app.cell(hide_code=True)
337
  def _(mo):
338
  mo.md(
@@ -445,6 +492,32 @@ def _(adlfs, df, os, pl):
445
  return
446
 
447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  @app.cell
449
  def _():
450
  import marimo as mo
@@ -482,13 +555,9 @@ def _():
482
  def _():
483
  import polars as pl
484
  import pandas as pd
485
- return pd, pl
486
-
487
-
488
- @app.cell
489
- def _():
490
  from polars.io.plugins import register_io_source
491
- return (register_io_source,)
 
492
 
493
 
494
  @app.cell
@@ -505,5 +574,34 @@ def _(itertools, string):
505
  return (get_positional_names,)
506
 
507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
  if __name__ == "__main__":
509
  app.run()
 
2
  # requires-python = ">=3.12"
3
  # dependencies = [
4
  # "adbc-driver-sqlite==1.6.0",
5
+ # "duckdb==1.3.1",
6
+ # "lxml==6.0.0",
7
  # "marimo",
8
+ # "pandas==2.3.0",
9
+ # "polars==1.31.0",
10
  # "pyarrow==20.0.0",
11
  # "sqlalchemy==2.0.41",
12
  # ]
 
14
 
15
  import marimo
16
 
17
+ __generated_with = "0.14.8"
18
  app = marimo.App(width="medium")
19
 
20
 
 
261
  - You are expected to provide a Schema before the Generator starts
262
  - - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
263
  - Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
 
264
  """
265
  )
266
  return
 
333
  return (my_custom_input_plugin,)
334
 
335
 
336
+ @app.cell(hide_code=True)
337
+ def _(mo):
338
+ mo.md(
339
+ r"""
340
+ ### DuckDB
341
+
342
+ In addition to Arrow interoperability support, [DuckDB](https://duckdb.org/) has also added support for loading query results into a polars LazyFrame
343
+
344
+ You can read more about polars and duckdb integrations in
345
+
346
+ - https://docs.pola.rs/user-guide/ecosystem/#duckdb
347
+ - https://duckdb.org/docs/stable/guides/python/polars.html
348
+
349
+ You can also learn more about DuckDB, including marimo's SQL-related features, in the marimo course dedicated to it.
350
+ """
351
+ )
352
+ return
353
+
354
+
355
+ @app.cell
356
+ def _():
357
+ # Amazing if you need features not yet supported by Polars, such as geospatial data
358
+ duckdb_query = """
359
+ SELECT
360
+ id,
361
+ name,
362
+ ST_X(geometry) as longitude,
363
+ ST_Y(geometry) as latitude
364
+ FROM locations
365
+ """
366
+ return (duckdb_query,)
367
+
368
+
369
+ @app.cell
370
+ def _(duckdb_conn, duckdb_query):
371
+ # Eager (default):
372
+ duckdb_conn.execute(duckdb_query).pl()
373
+ return
374
+
375
+
376
+ @app.cell(disabled=True)
377
+ def _(duckdb_conn, duckdb_query):
378
+ # Lazy (merged but not yet released at the time of writing; requires duckdb > 1.3.1):
379
+ duckdb_conn.execute(duckdb_query).pl(lazy=True)
380
+ return
381
+
382
+
383
  @app.cell(hide_code=True)
384
  def _(mo):
385
  mo.md(
 
492
  return
493
 
494
 
495
+ @app.cell(hide_code=True)
496
+ def _(mo):
497
+ mo.md(
498
+ r"""
499
+ ## Conclusion
500
+ As you have seen, polars makes it easy to work with a variety of formats and different data sources.
501
+
502
+ From natively supported formats such as Parquet and CSV files, to using other libraries as an intermediary for XML or geospatial data, and plugins for newly emerging or proprietary formats: as long as your data can fit in a table, odds are you can turn it into a polars DataFrame.
503
+
504
+ Combined with loading directly from remote sources, including public data platforms such as Hugging Face and Kaggle as well as private data in your cloud, you can import datasets for almost anything you can imagine.
505
+ """
506
+ )
507
+ return
508
+
509
+
510
+ @app.cell(hide_code=True)
511
+ def _(mo):
512
+ mo.md(
513
+ r"""
514
+ ## Utilities
515
+ Imports, utility functions and the like used throughout the notebook
516
+ """
517
+ )
518
+ return
519
+
520
+
521
  @app.cell
522
  def _():
523
  import marimo as mo
 
555
  def _():
556
  import polars as pl
557
  import pandas as pd
 
 
 
 
 
558
  from polars.io.plugins import register_io_source
559
+ import duckdb
560
+ return duckdb, pd, pl, register_io_source
561
 
562
 
563
  @app.cell
 
574
  return (get_positional_names,)
575
 
576
 
577
+ @app.cell
578
+ def _(duckdb):
579
+ # Connect to an ephemeral in-memory DuckDB database
580
+ duckdb_conn = duckdb.connect(":memory:")
581
+
582
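+ # Load the spatial extension for geometry support (only load_extension is called here; presumably the extension must already be installed, e.g. via install_extension("spatial") - TODO confirm)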
583
+ duckdb_conn.load_extension("spatial")
584
+
585
+ # Create a table with geometry column
586
+ duckdb_conn.execute("""
587
+ CREATE TABLE locations (
588
+ id INTEGER,
589
+ name VARCHAR,
590
+ geometry GEOMETRY
591
+ )
592
+ """)
593
+
594
+ # Insert some sample data with geometry points
595
+ duckdb_conn.execute("""
596
+ INSERT INTO locations VALUES
597
+ (1, 'New York', ST_Point(-74.0059, 40.7128)),
598
+ (2, 'Los Angeles', ST_Point(-118.2437, 34.0522)),
599
+ (3, 'Chicago', ST_Point(-87.6298, 41.8781)),
600
+ (4, 'Houston', ST_Point(-95.3698, 29.7604)),
601
+ (5, 'Phoenix', ST_Point(-112.0740, 33.4484))
602
+ """)
603
+ return (duckdb_conn,)
604
+
605
+
606
  if __name__ == "__main__":
607
  app.run()