Skip to content

Commit

Permalink
Add python bindings for new dataframe APIs (#7357)
Browse files Browse the repository at this point in the history
### What
- First pass at implementing the APIs proposed in #7455
- Introduces a new mechanism for directly exposing Rust types into the
Python bridge via a `.pyi` definition

Example notebook for testing
```
pixi run py-build-examples
pixi run -e examples jupyter notebook tests/python/dataframe/examples.ipynb
```

### Future work:
- More docs / help strings
- Remaining API features

### Checklist
* [x] I have read and agree to [Contributor
Guide](https://github.com/rerun-io/rerun/blob/main/CONTRIBUTING.md) and
the [Code of
Conduct](https://github.com/rerun-io/rerun/blob/main/CODE_OF_CONDUCT.md)
* [x] I've included a screenshot or gif (if applicable)
* [x] I have tested the web demo (if applicable):
* Using examples from latest `main` build:
[rerun.io/viewer](https://rerun.io/viewer/pr/7357?manifest_url=https://app.rerun.io/version/main/examples_manifest.json)
* Using full set of examples from `nightly` build:
[rerun.io/viewer](https://rerun.io/viewer/pr/7357?manifest_url=https://app.rerun.io/version/nightly/examples_manifest.json)
* [x] The PR title and labels are set such as to maximize their
usefulness for the next release's CHANGELOG
* [x] If applicable, add a new check to the [release
checklist](https://github.com/rerun-io/rerun/blob/main/tests/python/release_checklist)!
* [x] I have noted any breaking changes to the log API in
`CHANGELOG.md` and the migration guide

- [PR Build Summary](https://build.rerun.io/pr/7357)
- [Recent benchmark results](https://build.rerun.io/graphs/crates.html)
- [Wasm size tracking](https://build.rerun.io/graphs/sizes.html)

To run all checks from `main`, comment on the PR with `@rerun-bot
full-check`.
  • Loading branch information
jleibs authored Oct 4, 2024
1 parent 2408689 commit 5da39a5
Show file tree
Hide file tree
Showing 13 changed files with 1,214 additions and 2 deletions.
5 changes: 5 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6237,10 +6237,15 @@ dependencies = [
"re_build_info",
"re_build_tools",
"re_chunk",
"re_chunk_store",
"re_dataframe2",
"re_entity_db",
"re_log",
"re_log_encoding",
"re_log_types",
"re_memory",
"re_sdk",
"re_types",
"re_video",
"re_web_viewer_server",
"re_ws_comms",
Expand Down
8 changes: 7 additions & 1 deletion crates/store/re_chunk_store/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,12 +193,13 @@ impl Ord for TimeColumnDescriptor {

impl TimeColumnDescriptor {
    /// Returns the Arrow field describing this time column.
    ///
    /// The field must be nullable since static data doesn't have a time.
    #[inline]
    pub fn to_arrow_field(&self) -> ArrowField {
        let Self { timeline, datatype } = self;
        ArrowField::new(
            timeline.name().to_string(),
            datatype.clone(),
            true, /* nullable */
        )
    }
}
Expand Down Expand Up @@ -337,6 +338,11 @@ impl ComponentColumnDescriptor {
}
}

/// Whether this descriptor refers to the given entity path / component pair.
#[inline]
pub fn matches(&self, entity_path: &EntityPath, component_name: &ComponentName) -> bool {
    self.entity_path == *entity_path && self.component_name == *component_name
}

fn metadata(&self) -> arrow2::datatypes::Metadata {
let Self {
entity_path,
Expand Down
8 changes: 7 additions & 1 deletion rerun_py/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,21 @@ web_viewer = [

[dependencies]
re_build_info.workspace = true
re_chunk.workspace = true
re_chunk = { workspace = true, features = ["arrow"] }
re_chunk_store = { workspace = true }
re_dataframe2 = { workspace = true }
re_entity_db = { workspace = true }
re_log = { workspace = true, features = ["setup"] }
re_log_encoding = { workspace = true }
re_log_types.workspace = true
re_memory.workspace = true
re_sdk = { workspace = true, features = ["data_loaders"] }
re_types = { workspace = true }
re_video.workspace = true
re_web_viewer_server = { workspace = true, optional = true }
re_ws_comms = { workspace = true, optional = true }


arrow = { workspace = true, features = ["pyarrow"] }
arrow2 = { workspace = true, features = ["io_ipc", "io_print", "arrow"] }
crossbeam.workspace = true
Expand Down
5 changes: 5 additions & 0 deletions rerun_py/rerun_bindings/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Python shim around the native `rerun_bindings` extension module."""

from __future__ import annotations

# Re-export everything from the native extension at the package top level.
# The previous `typing` imports (TYPE_CHECKING, TypeAlias, Union) were unused
# in this module (they live in `types.py`) and have been removed.
from .rerun_bindings import *  # noqa: F401, F403
Empty file.
107 changes: 107 additions & 0 deletions rerun_py/rerun_bindings/rerun_bindings.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from typing import Optional, Sequence

import pyarrow as pa

from .types import AnyColumn, ComponentLike, ViewContentsLike

class ControlColumnDescriptor:
    """
    A control-level column such as `RowId`.

    Instances are obtained from [`Schema.control_columns`][]; the stub declares no constructor.
    """

class ControlColumnSelector:
    """A selector for a control column."""

    @staticmethod
    def row_id() -> ControlColumnSelector:
        """Create a selector for the `row_id` control column."""
        ...

class IndexColumnDescriptor:
    """
    A column containing the index values for when the component data was updated.

    Instances are obtained from [`Schema.index_columns`][].
    """

class IndexColumnSelector:
    """A selector for an index column."""

    def __init__(self, timeline: str):
        """
        Construct a selector for the index column of the given timeline.

        Parameters
        ----------
        timeline : str
            The name of the timeline to select.
        """
        ...

class ComponentColumnDescriptor:
    """A column containing the component data."""

    def with_dictionary_encoding(self) -> ComponentColumnDescriptor:
        """
        Return a descriptor that requests dictionary-encoding for this column.

        NOTE(review): the encoding semantics are implemented on the Rust side and are
        not visible from this stub -- confirm before documenting further.
        """
        ...

class ComponentColumnSelector:
    """A selector for a component column."""

    def __new__(cls, entity_path: str, component_type: ComponentLike):
        """
        Construct a selector for the given entity path and component type.

        Parameters
        ----------
        entity_path : str
            The entity path to select.
        component_type : ComponentLike
            The component to select, by name or by `ComponentMixin` type.
        """
        ...

    def with_dictionary_encoding(self) -> ComponentColumnSelector:
        """
        Return a selector that requests dictionary-encoding for this column.

        NOTE(review): the encoding semantics are implemented on the Rust side and are
        not visible from this stub -- confirm before documenting further.
        """
        ...

class Schema:
    """The schema representing all columns in a [`Recording`][]."""

    def control_columns(self) -> list[ControlColumnDescriptor]:
        """The descriptors of all control-level columns (such as `RowId`)."""
        ...

    def index_columns(self) -> list[IndexColumnDescriptor]:
        """The descriptors of all index columns."""
        ...

    def component_columns(self) -> list[ComponentColumnDescriptor]:
        """The descriptors of all component-data columns."""
        ...

    def column_for(self, entity_path: str, component: ComponentLike) -> Optional[ComponentColumnDescriptor]:
        """Look up the column descriptor for a given entity path / component pair, or `None` if it does not exist."""
        ...

class RecordingView:
    """
    A view of a recording restricted to a given index, containing a specific set of entities and components.

    Can only be created by calling `view(...)` on a `Recording`.

    The only type of index currently supported is the name of a timeline.

    The view will only contain a single row for each unique value of the index. If the same entity / component pair
    was logged to a given index multiple times, only the most recent row will be included in the view, as determined
    by the `row_id` column. This will generally be the last value logged, as row_ids are guaranteed to be monotonically
    increasing when data is sent from a single process.
    """

    def filter_range_sequence(self, start: int, end: int) -> RecordingView:
        """Filter the view to only include data between the given index sequence numbers."""
        ...

    def filter_range_seconds(self, start: float, end: float) -> RecordingView:
        """Filter the view to only include data between the given index time values, expressed in seconds."""
        ...

    def filter_range_nanos(self, start: int, end: int) -> RecordingView:
        """Filter the view to only include data between the given index time values, expressed in nanoseconds."""
        ...

    def select(self, columns: Sequence[AnyColumn]) -> list[pa.RecordBatch]:
        """Select the given columns from the view, returning the data as a list of Arrow record batches."""
        ...

class Recording:
    """A single recording."""

    def schema(self) -> Schema:
        """The schema describing all the columns available in the recording."""
        ...

    def view(self, index: str, contents: ViewContentsLike) -> RecordingView:
        """
        Create a `RecordingView` of the recording on the given index.

        Parameters
        ----------
        index : str
            The name of the timeline to use as the index.
        contents : ViewContentsLike
            The contents to include in the view.
        """
        ...

class RRDArchive:
    """An archive loaded from an RRD, typically containing 1 or more recordings or blueprints."""

    def num_recordings(self) -> int:
        """The number of recordings in the archive."""
        ...

    def all_recordings(self) -> list[Recording]:
        """All the recordings in the archive."""
        ...

def load_recording(filename: str) -> Recording:
    """
    Load a single recording from an RRD.

    Will raise a `ValueError` if the file does not contain exactly one recording.

    Parameters
    ----------
    filename : str
        The path to the file to load.

    Returns
    -------
    Recording
        The loaded recording.
    """
    ...

def load_archive(filename: str) -> RRDArchive:
    """
    Load a rerun archive file from disk.

    Parameters
    ----------
    filename : str
        The path to the file to load.

    Returns
    -------
    RRDArchive
        The loaded archive.
    """
    ...
37 changes: 37 additions & 0 deletions rerun_py/rerun_bindings/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Type aliases used by the dataframe API exposed through `rerun_bindings`."""

from __future__ import annotations

from typing import TYPE_CHECKING, Sequence, TypeAlias, Union

if TYPE_CHECKING:
    from rerun._baseclasses import ComponentMixin

    # NOTE(review): this imports `TimeColumnDescriptor` / `TimeColumnSelector`, but the
    # accompanying stub file declares `IndexColumnDescriptor` / `IndexColumnSelector` --
    # confirm which names the compiled bindings actually export.
    from .rerun_bindings import (
        ComponentColumnDescriptor as ComponentColumnDescriptor,
        ComponentColumnSelector as ComponentColumnSelector,
        ControlColumnDescriptor as ControlColumnDescriptor,
        ControlColumnSelector as ControlColumnSelector,
        TimeColumnDescriptor as TimeColumnDescriptor,
        TimeColumnSelector as TimeColumnSelector,
    )


# A component, referred to either by its name (str) or by its `ComponentMixin` subclass.
ComponentLike: TypeAlias = Union[str, type["ComponentMixin"]]

# Any descriptor or selector that can refer to a column.
AnyColumn: TypeAlias = Union[
    "ControlColumnDescriptor",
    "TimeColumnDescriptor",
    "ComponentColumnDescriptor",
    "ControlColumnSelector",
    "TimeColumnSelector",
    "ComponentColumnSelector",
]

# Any descriptor or selector that refers specifically to a component-data column.
AnyComponentColumn: TypeAlias = Union[
    "ComponentColumnDescriptor",
    "ComponentColumnSelector",
]

# Contents of a view: either a string expression, or a mapping of str
# (presumably entity-path expressions -- confirm against `Recording.view`)
# to the column(s)/component(s) to include.
ViewContentsLike: TypeAlias = Union[
    str,
    dict[str, Union[AnyColumn, Sequence[ComponentLike]]],
]
1 change: 1 addition & 0 deletions rerun_py/rerun_sdk/rerun/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from . import (
blueprint as blueprint,
dataframe as dataframe,
experimental as experimental,
notebook as notebook,
)
Expand Down
20 changes: 20 additions & 0 deletions rerun_py/rerun_sdk/rerun/dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
"""User-facing `rerun.dataframe` module: re-exports the native dataframe bindings."""

from __future__ import annotations

# Explicit `name as name` re-exports so type checkers treat these as public API.
# NOTE(review): `TimeColumnDescriptor` / `TimeColumnSelector` are imported here, but the
# bindings stub declares `IndexColumnDescriptor` / `IndexColumnSelector` -- confirm the
# compiled bindings export the `Time*` names.
from rerun_bindings import (
    ComponentColumnDescriptor as ComponentColumnDescriptor,
    ComponentColumnSelector as ComponentColumnSelector,
    ControlColumnDescriptor as ControlColumnDescriptor,
    ControlColumnSelector as ControlColumnSelector,
    Recording as Recording,
    RRDArchive as RRDArchive,
    Schema as Schema,
    TimeColumnDescriptor as TimeColumnDescriptor,
    TimeColumnSelector as TimeColumnSelector,
    load_archive as load_archive,
    load_recording as load_recording,
)
from rerun_bindings.types import (
    AnyColumn as AnyColumn,
    AnyComponentColumn as AnyComponentColumn,
    ComponentLike as ComponentLike,
)
Loading

0 comments on commit 5da39a5

Please sign in to comment.