aboutsummaryrefslogtreecommitdiff
path: root/venv/lib/python3.8/site-packages/narwhals/dataframe.py
diff options
context:
space:
mode:
authorsotech117 <michael_foiani@brown.edu>2025-07-31 17:27:24 -0400
committersotech117 <michael_foiani@brown.edu>2025-07-31 17:27:24 -0400
commit5bf22fc7e3c392c8bd44315ca2d06d7dca7d084e (patch)
tree8dacb0f195df1c0788d36dd0064f6bbaa3143ede /venv/lib/python3.8/site-packages/narwhals/dataframe.py
parentb832d364da8c2efe09e3f75828caf73c50d01ce3 (diff)
add code for analysis of data
Diffstat (limited to 'venv/lib/python3.8/site-packages/narwhals/dataframe.py')
-rw-r--r--venv/lib/python3.8/site-packages/narwhals/dataframe.py3234
1 files changed, 3234 insertions, 0 deletions
diff --git a/venv/lib/python3.8/site-packages/narwhals/dataframe.py b/venv/lib/python3.8/site-packages/narwhals/dataframe.py
new file mode 100644
index 0000000..b0ff471
--- /dev/null
+++ b/venv/lib/python3.8/site-packages/narwhals/dataframe.py
@@ -0,0 +1,3234 @@
+from __future__ import annotations
+
+from abc import abstractmethod
+from itertools import chain
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Generic,
+ Iterable,
+ Iterator,
+ Literal,
+ NoReturn,
+ Sequence,
+ TypeVar,
+ overload,
+)
+from warnings import warn
+
+from narwhals._expression_parsing import (
+ ExprKind,
+ all_exprs_are_scalar_like,
+ check_expressions_preserve_length,
+ is_scalar_like,
+)
+from narwhals._utils import (
+ Implementation,
+ find_stacklevel,
+ flatten,
+ generate_repr,
+ is_compliant_dataframe,
+ is_compliant_lazyframe,
+ is_index_selector,
+ is_list_of,
+ is_sequence_like,
+ is_slice_none,
+ issue_deprecation_warning,
+ parse_version,
+ supports_arrow_c_stream,
+)
+from narwhals.dependencies import get_polars, is_numpy_array
+from narwhals.exceptions import (
+ InvalidIntoExprError,
+ LengthChangingExprError,
+ OrderDependentExprError,
+)
+from narwhals.schema import Schema
+from narwhals.series import Series
+from narwhals.translate import to_native
+
+if TYPE_CHECKING:
+ from io import BytesIO
+ from pathlib import Path
+ from types import ModuleType
+
+ import pandas as pd
+ import polars as pl
+ import pyarrow as pa
+ from typing_extensions import Concatenate, ParamSpec, Self, TypeAlias
+
+ from narwhals._compliant import CompliantDataFrame, CompliantLazyFrame
+ from narwhals._compliant.typing import CompliantExprAny, EagerNamespaceAny
+ from narwhals.group_by import GroupBy, LazyGroupBy
+ from narwhals.typing import (
+ AsofJoinStrategy,
+ IntoDataFrame,
+ IntoExpr,
+ IntoFrame,
+ JoinStrategy,
+ LazyUniqueKeepStrategy,
+ MultiColSelector as _MultiColSelector,
+ MultiIndexSelector as _MultiIndexSelector,
+ PivotAgg,
+ SingleColSelector,
+ SingleIndexSelector,
+ SizeUnit,
+ UniqueKeepStrategy,
+ _2DArray,
+ )
+
+ PS = ParamSpec("PS")
+
# Type variables tying narwhals wrappers to the native objects they wrap.
_FrameT = TypeVar("_FrameT", bound="IntoFrame")
FrameT = TypeVar("FrameT", bound="IntoFrame")
DataFrameT = TypeVar("DataFrameT", bound="IntoDataFrame")
R = TypeVar("R")

# Selector aliases specialised to narwhals' own Series type (string forward refs
# because these are evaluated lazily under `from __future__ import annotations`).
MultiColSelector: TypeAlias = "_MultiColSelector[Series[Any]]"
MultiIndexSelector: TypeAlias = "_MultiIndexSelector[Series[Any]]"
+
+
class BaseFrame(Generic[_FrameT]):
    """Common machinery shared by `DataFrame` and `LazyFrame`.

    Wraps a backend-specific "compliant" frame and delegates every operation
    to it, re-wrapping results so the narwhals `_level` is preserved.
    """

    # Backend adapter that all operations are forwarded to.
    _compliant_frame: Any
    # API level supported by the wrapped backend.
    _level: Literal["full", "lazy", "interchange"]

    def __native_namespace__(self) -> ModuleType:
        """Return the native module (e.g. `pandas`, `polars`, `pyarrow`)."""
        return self._compliant_frame.__native_namespace__()  # type: ignore[no-any-return]

    def __narwhals_namespace__(self) -> Any:
        """Return the compliant (backend adapter) namespace."""
        return self._compliant_frame.__narwhals_namespace__()

    def _with_compliant(self, df: Any) -> Self:
        # construct, preserving properties
        return self.__class__(df, level=self._level)  # type: ignore[call-arg]

    def _flatten_and_extract(
        self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
    ) -> tuple[list[CompliantExprAny], list[ExprKind]]:
        """Process `args` and `kwargs`, extracting underlying objects as we go, interpreting strings as column names."""
        out_exprs = []
        out_kinds = []
        for expr in flatten(exprs):
            compliant_expr = self._extract_compliant(expr)
            out_exprs.append(compliant_expr)
            # Record each expression's kind so callers can decide on broadcasting.
            out_kinds.append(ExprKind.from_into_expr(expr, str_as_lit=False))
        for alias, expr in named_exprs.items():
            # Keyword arguments name their output column via `.alias`.
            compliant_expr = self._extract_compliant(expr).alias(alias)
            out_exprs.append(compliant_expr)
            out_kinds.append(ExprKind.from_into_expr(expr, str_as_lit=False))
        return out_exprs, out_kinds

    @abstractmethod
    def _extract_compliant(self, arg: Any) -> Any:
        """Convert a user-facing object into its compliant counterpart (subclass hook)."""
        raise NotImplementedError

    @property
    def schema(self) -> Schema:
        """Frame schema as an ordered mapping of column name to dtype."""
        return Schema(self._compliant_frame.schema.items())

    def collect_schema(self) -> Schema:
        """Materialise and return the schema (may require computation on lazy backends)."""
        native_schema = dict(self._compliant_frame.collect_schema())

        return Schema(native_schema)

    def pipe(
        self,
        function: Callable[Concatenate[Self, PS], R],
        *args: PS.args,
        **kwargs: PS.kwargs,
    ) -> R:
        """Call `function(self, *args, **kwargs)` and return its result."""
        return function(self, *args, **kwargs)

    def with_row_index(self, name: str = "index") -> Self:
        """Prepend a row-index column named `name`."""
        return self._with_compliant(self._compliant_frame.with_row_index(name))

    def drop_nulls(self, subset: str | list[str] | None) -> Self:
        """Drop rows with nulls in `subset` columns (all columns when None)."""
        subset = [subset] if isinstance(subset, str) else subset
        return self._with_compliant(self._compliant_frame.drop_nulls(subset=subset))

    @property
    def columns(self) -> list[str]:
        """List of column names."""
        return self._compliant_frame.columns  # type: ignore[no-any-return]

    def with_columns(
        self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
    ) -> Self:
        """Add/replace columns; scalar-like expressions are broadcast to frame length."""
        compliant_exprs, kinds = self._flatten_and_extract(*exprs, **named_exprs)
        compliant_exprs = [
            compliant_expr.broadcast(kind) if is_scalar_like(kind) else compliant_expr
            for compliant_expr, kind in zip(compliant_exprs, kinds)
        ]
        return self._with_compliant(self._compliant_frame.with_columns(*compliant_exprs))

    def select(
        self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
    ) -> Self:
        """Select columns/expressions; an all-string selection takes a fast path."""
        flat_exprs = tuple(flatten(exprs))
        if flat_exprs and all(isinstance(x, str) for x in flat_exprs) and not named_exprs:
            # fast path!
            try:
                return self._with_compliant(
                    self._compliant_frame.simple_select(*flat_exprs)
                )
            except Exception as e:
                # Column not found is the only thing that can realistically be raised here.
                if error := self._compliant_frame._check_columns_exist(flat_exprs):
                    raise error from e
                raise
        compliant_exprs, kinds = self._flatten_and_extract(*flat_exprs, **named_exprs)
        if compliant_exprs and all_exprs_are_scalar_like(*flat_exprs, **named_exprs):
            # Every expression reduces to a scalar -> one-row aggregation.
            return self._with_compliant(self._compliant_frame.aggregate(*compliant_exprs))
        compliant_exprs = [
            compliant_expr.broadcast(kind) if is_scalar_like(kind) else compliant_expr
            for compliant_expr, kind in zip(compliant_exprs, kinds)
        ]
        return self._with_compliant(self._compliant_frame.select(*compliant_exprs))

    def rename(self, mapping: dict[str, str]) -> Self:
        """Rename columns per `mapping` (old name -> new name)."""
        return self._with_compliant(self._compliant_frame.rename(mapping))

    def head(self, n: int) -> Self:
        """First `n` rows."""
        return self._with_compliant(self._compliant_frame.head(n))

    def tail(self, n: int) -> Self:
        """Last `n` rows."""
        return self._with_compliant(self._compliant_frame.tail(n))

    def drop(self, *columns: Iterable[str], strict: bool) -> Self:
        """Drop the given columns; `strict` controls whether unknown names raise."""
        return self._with_compliant(self._compliant_frame.drop(columns, strict=strict))

    def filter(
        self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any
    ) -> Self:
        """Keep rows satisfying all predicates and `column=value` constraints."""
        if len(predicates) == 1 and is_list_of(predicates[0], bool):
            # A single boolean mask is forwarded unchanged.
            predicate = predicates[0]
        else:
            from narwhals.functions import col

            flat_predicates = flatten(predicates)
            # Filters must not change length (e.g. no aggregations allowed here).
            check_expressions_preserve_length(*flat_predicates, function_name="filter")
            plx = self.__narwhals_namespace__()
            compliant_predicates, _kinds = self._flatten_and_extract(*flat_predicates)
            # Keyword constraints become equality predicates.
            compliant_constraints = (
                (col(name) == v)._to_compliant_expr(plx)
                for name, v in constraints.items()
            )
            predicate = plx.all_horizontal(
                *chain(compliant_predicates, compliant_constraints)
            )
        return self._with_compliant(self._compliant_frame.filter(predicate))

    def sort(
        self,
        by: str | Iterable[str],
        *more_by: str,
        descending: bool | Sequence[bool] = False,
        nulls_last: bool = False,
    ) -> Self:
        """Sort by one or more columns."""
        by = flatten([*flatten([by]), *more_by])
        return self._with_compliant(
            self._compliant_frame.sort(*by, descending=descending, nulls_last=nulls_last)
        )

    def join(
        self,
        other: Self,
        on: str | list[str] | None = None,
        how: JoinStrategy = "inner",
        *,
        left_on: str | list[str] | None = None,
        right_on: str | list[str] | None = None,
        suffix: str = "_right",
    ) -> Self:
        """Join with `other`; validates the key/strategy combination before delegating."""
        on = [on] if isinstance(on, str) else on
        left_on = [left_on] if isinstance(left_on, str) else left_on
        right_on = [right_on] if isinstance(right_on, str) else right_on

        if how not in (
            _supported_joins := ("inner", "left", "full", "cross", "anti", "semi")
        ):
            msg = f"Only the following join strategies are supported: {_supported_joins}; found '{how}'."
            raise NotImplementedError(msg)

        # Cross joins take no keys at all.
        if how == "cross" and (
            left_on is not None or right_on is not None or on is not None
        ):
            msg = "Can not pass `left_on`, `right_on` or `on` keys for cross join"
            raise ValueError(msg)

        # Every other strategy needs either `on` or both `left_on`/`right_on`.
        if how != "cross" and (on is None and (left_on is None or right_on is None)):
            msg = f"Either (`left_on` and `right_on`) or `on` keys should be specified for {how}."
            raise ValueError(msg)

        if how != "cross" and (
            on is not None and (left_on is not None or right_on is not None)
        ):
            msg = f"If `on` is specified, `left_on` and `right_on` should be None for {how}."
            raise ValueError(msg)

        if on is not None:
            left_on = right_on = on

        if (isinstance(left_on, list) and isinstance(right_on, list)) and (
            len(left_on) != len(right_on)
        ):
            msg = "`left_on` and `right_on` must have the same length."
            raise ValueError(msg)

        return self._with_compliant(
            self._compliant_frame.join(
                self._extract_compliant(other),
                how=how,
                left_on=left_on,
                right_on=right_on,
                suffix=suffix,
            )
        )

    def gather_every(self, n: int, offset: int = 0) -> Self:
        """Take every `n`-th row, starting at `offset`."""
        return self._with_compliant(
            self._compliant_frame.gather_every(n=n, offset=offset)
        )

    def join_asof(  # noqa: C901
        self,
        other: Self,
        *,
        left_on: str | None = None,
        right_on: str | None = None,
        on: str | None = None,
        by_left: str | list[str] | None = None,
        by_right: str | list[str] | None = None,
        by: str | list[str] | None = None,
        strategy: AsofJoinStrategy = "backward",
        suffix: str = "_right",
    ) -> Self:
        """Inexact (as-of) join; validates key/strategy arguments before delegating."""
        _supported_strategies = ("backward", "forward", "nearest")

        if strategy not in _supported_strategies:
            msg = f"Only the following strategies are supported: {_supported_strategies}; found '{strategy}'."
            raise NotImplementedError(msg)

        # `on` and `left_on`/`right_on` are mutually exclusive, and one is required.
        if (on is None) and (left_on is None or right_on is None):
            msg = "Either (`left_on` and `right_on`) or `on` keys should be specified."
            raise ValueError(msg)
        if (on is not None) and (left_on is not None or right_on is not None):
            msg = "If `on` is specified, `left_on` and `right_on` should be None."
            raise ValueError(msg)
        # Same exclusivity rules for the grouping keys.
        if (by is None) and (
            (by_left is None and by_right is not None)
            or (by_left is not None and by_right is None)
        ):
            msg = (
                "Can not specify only `by_left` or `by_right`, you need to specify both."
            )
            raise ValueError(msg)
        if (by is not None) and (by_left is not None or by_right is not None):
            msg = "If `by` is specified, `by_left` and `by_right` should be None."
            raise ValueError(msg)
        if on is not None:
            left_on = right_on = on
        if by is not None:
            by_left = by_right = by
        if isinstance(by_left, str):
            by_left = [by_left]
        if isinstance(by_right, str):
            by_right = [by_right]

        if (isinstance(by_left, list) and isinstance(by_right, list)) and (
            len(by_left) != len(by_right)
        ):
            msg = "`by_left` and `by_right` must have the same length."
            raise ValueError(msg)

        return self._with_compliant(
            self._compliant_frame.join_asof(
                self._extract_compliant(other),
                left_on=left_on,
                right_on=right_on,
                by_left=by_left,
                by_right=by_right,
                strategy=strategy,
                suffix=suffix,
            )
        )

    def unpivot(
        self,
        on: str | list[str] | None,
        *,
        index: str | list[str] | None,
        variable_name: str,
        value_name: str,
    ) -> Self:
        """Unpivot (melt) from wide to long format."""
        on = [on] if isinstance(on, str) else on
        index = [index] if isinstance(index, str) else index

        return self._with_compliant(
            self._compliant_frame.unpivot(
                on=on, index=index, variable_name=variable_name, value_name=value_name
            )
        )

    # NOTE(review): `__neq__` is not a Python special method (that would be
    # `__ne__`); `df != 0` still raises because the default `__ne__` negates
    # `__eq__`, which raises below. Left as-is to match upstream.
    def __neq__(self, other: object) -> NoReturn:
        msg = (
            "DataFrame.__neq__ and LazyFrame.__neq__ are not implemented, please "
            "use expressions instead.\n\n"
            "Hint: instead of\n"
            "    df != 0\n"
            "you may want to use\n"
            "    df.select(nw.all() != 0)"
        )
        raise NotImplementedError(msg)

    def __eq__(self, other: object) -> NoReturn:
        # Deliberately unsupported: elementwise comparison belongs in expressions.
        # (Defining __eq__ also makes instances unhashable, which is intended.)
        msg = (
            "DataFrame.__eq__ and LazyFrame.__eq__ are not implemented, please "
            "use expressions instead.\n\n"
            "Hint: instead of\n"
            "    df == 0\n"
            "you may want to use\n"
            "    df.select(nw.all() == 0)"
        )
        raise NotImplementedError(msg)

    def explode(self, columns: str | Sequence[str], *more_columns: str) -> Self:
        """Explode list-typed columns, one row per list element."""
        to_explode = (
            [columns, *more_columns]
            if isinstance(columns, str)
            else [*columns, *more_columns]
        )

        return self._with_compliant(self._compliant_frame.explode(columns=to_explode))
+
+
+class DataFrame(BaseFrame[DataFrameT]):
+ """Narwhals DataFrame, backed by a native eager dataframe.
+
+ Warning:
+ This class is not meant to be instantiated directly - instead:
+
+ - If the native object is a eager dataframe from one of the supported
+ backend (e.g. pandas.DataFrame, polars.DataFrame, pyarrow.Table),
+ you can use [`narwhals.from_native`][]:
+ ```py
+ narwhals.from_native(native_dataframe)
+ narwhals.from_native(native_dataframe, eager_only=True)
+ ```
+
+ - If the object is a dictionary of column names and generic sequences mapping
+ (e.g. `dict[str, list]`), you can create a DataFrame via
+ [`narwhals.from_dict`][]:
+ ```py
+ narwhals.from_dict(
+ data={"a": [1, 2, 3]},
+ backend=narwhals.get_native_namespace(another_object),
+ )
+ ```
+ """
+
    def _extract_compliant(self, arg: Any) -> Any:
        """Convert `arg` (frame, Series, Expr, column name, or numpy array) to its compliant form.

        The check order matters: narwhals objects first, then strings as
        column references, then a friendly error for raw polars objects,
        then numpy arrays; anything else is rejected.
        """
        from narwhals.expr import Expr
        from narwhals.series import Series

        plx: EagerNamespaceAny = self.__narwhals_namespace__()
        if isinstance(arg, BaseFrame):
            return arg._compliant_frame
        if isinstance(arg, Series):
            return arg._compliant_series._to_expr()
        if isinstance(arg, Expr):
            return arg._to_compliant_expr(self.__narwhals_namespace__())
        if isinstance(arg, str):
            # Bare strings are interpreted as column names.
            return plx.col(arg)
        if get_polars() is not None and "polars" in str(type(arg)):  # pragma: no cover
            # Passing raw polars objects is a common mistake - give a hint.
            msg = (
                f"Expected Narwhals object, got: {type(arg)}.\n\n"
                "Perhaps you:\n"
                "- Forgot a `nw.from_native` somewhere?\n"
                "- Used `pl.col` instead of `nw.col`?"
            )
            raise TypeError(msg)
        if is_numpy_array(arg):
            return plx._series.from_numpy(arg, context=plx)._to_expr()
        raise InvalidIntoExprError.from_invalid_type(type(arg))
+
    @property
    def _series(self) -> type[Series[Any]]:
        # Series class used to wrap single columns extracted from this frame.
        return Series
+
    @property
    def _lazyframe(self) -> type[LazyFrame[Any]]:
        # LazyFrame class returned by `DataFrame.lazy`.
        return LazyFrame
+
    def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> None:
        """Wrap a compliant dataframe; `df` must implement `__narwhals_dataframe__`."""
        self._level: Literal["full", "lazy", "interchange"] = level
        # NOTE: Interchange support (`DataFrameLike`) is the source of the error
        self._compliant_frame: CompliantDataFrame[Any, Any, DataFrameT, Self]  # type: ignore[type-var]
        if is_compliant_dataframe(df):
            self._compliant_frame = df.__narwhals_dataframe__()
        else:  # pragma: no cover
            msg = f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}"
            raise AssertionError(msg)
+
+ @property
+ def implementation(self) -> Implementation:
+ """Return implementation of native frame.
+
+ This can be useful when you need to use special-casing for features outside of
+ Narwhals' scope - for example, when dealing with pandas' Period Dtype.
+
+ Returns:
+ Implementation.
+
+ Examples:
+ >>> import narwhals as nw
+ >>> import pandas as pd
+ >>> df_native = pd.DataFrame({"a": [1, 2, 3]})
+ >>> df = nw.from_native(df_native)
+ >>> df.implementation
+ <Implementation.PANDAS: 'pandas'>
+ >>> df.implementation.is_pandas()
+ True
+ >>> df.implementation.is_pandas_like()
+ True
+ >>> df.implementation.is_polars()
+ False
+ """
+ return self._compliant_frame._implementation
+
+ def __len__(self) -> int:
+ return self._compliant_frame.__len__()
+
    def __array__(self, dtype: Any = None, copy: bool | None = None) -> _2DArray:  # noqa: FBT001
        # NumPy interop hook: delegate array conversion to the compliant frame.
        return self._compliant_frame.__array__(dtype, copy=copy)
+
    def __repr__(self) -> str:  # pragma: no cover
        # Box the native frame's repr inside a "Narwhals DataFrame" banner.
        return generate_repr("Narwhals DataFrame", self.to_native().__repr__())
+
    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
        """Export a DataFrame via the Arrow PyCapsule Interface.

        - if the underlying dataframe implements the interface, it'll return that
        - else, it'll call `to_arrow` and then defer to PyArrow's implementation

        See [PyCapsule Interface](https://arrow.apache.org/docs/dev/format/CDataInterface/PyCapsuleInterface.html)
        for more.
        """
        native_frame = self._compliant_frame._native_frame
        if supports_arrow_c_stream(native_frame):
            # The native frame speaks the PyCapsule protocol itself - pass through.
            return native_frame.__arrow_c_stream__(requested_schema=requested_schema)
        try:
            import pyarrow as pa  # ignore-banned-import
        except ModuleNotFoundError as exc:  # pragma: no cover
            msg = f"'pyarrow>=14.0.0' is required for `DataFrame.__arrow_c_stream__` for object of type {type(native_frame)}"
            raise ModuleNotFoundError(msg) from exc
        if parse_version(pa) < (14, 0):  # pragma: no cover
            # The PyCapsule stream export requires pyarrow >= 14.
            msg = f"'pyarrow>=14.0.0' is required for `DataFrame.__arrow_c_stream__` for object of type {type(native_frame)}"
            raise ModuleNotFoundError(msg) from None
        # Fallback: materialise as a pyarrow Table and export that.
        pa_table = self.to_arrow()
        return pa_table.__arrow_c_stream__(requested_schema=requested_schema)  # type: ignore[no-untyped-call]
+
+ def lazy(
+ self, backend: ModuleType | Implementation | str | None = None
+ ) -> LazyFrame[Any]:
+ """Restrict available API methods to lazy-only ones.
+
+ If `backend` is specified, then a conversion between different backends
+ might be triggered.
+
+ If a library does not support lazy execution and `backend` is not specified,
+ then this is will only restrict the API to lazy-only operations. This is useful
+ if you want to ensure that you write dataframe-agnostic code which all has
+ the possibility of running entirely lazily.
+
+ Arguments:
+ backend: Which lazy backend collect to. This will be the underlying
+ backend for the resulting Narwhals LazyFrame. If not specified, and the
+ given library does not support lazy execution, then this will restrict
+ the API to lazy-only operations.
+
+ `backend` can be specified in various ways
+
+ - As `Implementation.<BACKEND>` with `BACKEND` being `DASK`, `DUCKDB`
+ or `POLARS`.
+ - As a string: `"dask"`, `"duckdb"` or `"polars"`
+ - Directly as a module `dask.dataframe`, `duckdb` or `polars`.
+
+ Returns:
+ A new LazyFrame.
+
+ Examples:
+ >>> import polars as pl
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> df_native = pl.DataFrame({"a": [1, 2], "b": [4, 6]})
+ >>> df = nw.from_native(df_native)
+
+ If we call `df.lazy`, we get a `narwhals.LazyFrame` backed by a Polars
+ LazyFrame.
+
+ >>> df.lazy() # doctest: +SKIP
+ ┌─────────────────────────────┐
+ | Narwhals LazyFrame |
+ |-----------------------------|
+ |<LazyFrame at 0x7F52B9937230>|
+ └─────────────────────────────┘
+
+ We can also pass DuckDB as the backend, and then we'll get a
+ `narwhals.LazyFrame` backed by a `duckdb.DuckDBPyRelation`.
+
+ >>> df.lazy(backend=nw.Implementation.DUCKDB)
+ ┌──────────────────┐
+ |Narwhals LazyFrame|
+ |------------------|
+ |┌───────┬───────┐ |
+ |│ a │ b │ |
+ |│ int64 │ int64 │ |
+ |├───────┼───────┤ |
+ |│ 1 │ 4 │ |
+ |│ 2 │ 6 │ |
+ |└───────┴───────┘ |
+ └──────────────────┘
+ """
+ lazy_backend = None if backend is None else Implementation.from_backend(backend)
+ supported_lazy_backends = (
+ Implementation.DASK,
+ Implementation.DUCKDB,
+ Implementation.POLARS,
+ )
+ if lazy_backend is not None and lazy_backend not in supported_lazy_backends:
+ msg = (
+ "Not-supported backend."
+ f"\n\nExpected one of {supported_lazy_backends} or `None`, got {lazy_backend}"
+ )
+ raise ValueError(msg)
+ return self._lazyframe(
+ self._compliant_frame.lazy(backend=lazy_backend), level="lazy"
+ )
+
+ def to_native(self) -> DataFrameT:
+ """Convert Narwhals DataFrame to native one.
+
+ Returns:
+ Object of class that user started with.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame(
+ ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
+ ... )
+
+ Calling `to_native` on a Narwhals DataFrame returns the native object:
+
+ >>> nw.from_native(df_native).to_native()
+ foo bar ham
+ 0 1 6.0 a
+ 1 2 7.0 b
+ 2 3 8.0 c
+ """
+ return self._compliant_frame._native_frame
+
+ def to_pandas(self) -> pd.DataFrame:
+ """Convert this DataFrame to a pandas DataFrame.
+
+ Returns:
+ A pandas DataFrame.
+
+ Examples:
+ >>> import polars as pl
+ >>> import narwhals as nw
+ >>> df_native = pl.DataFrame(
+ ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
+ ... )
+ >>> df = nw.from_native(df_native)
+ >>> df.to_pandas()
+ foo bar ham
+ 0 1 6.0 a
+ 1 2 7.0 b
+ 2 3 8.0 c
+ """
+ return self._compliant_frame.to_pandas()
+
+ def to_polars(self) -> pl.DataFrame:
+ """Convert this DataFrame to a polars DataFrame.
+
+ Returns:
+ A polars DataFrame.
+
+ Examples:
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
+ >>> df = nw.from_native(df_native)
+ >>> df.to_polars()
+ shape: (2, 2)
+ ┌─────┬─────┐
+ │ foo ┆ bar │
+ │ --- ┆ --- │
+ │ i64 ┆ f64 │
+ ╞═════╪═════╡
+ │ 1 ┆ 6.0 │
+ │ 2 ┆ 7.0 │
+ └─────┴─────┘
+ """
+ return self._compliant_frame.to_polars()
+
+ @overload
+ def write_csv(self, file: None = None) -> str: ...
+
+ @overload
+ def write_csv(self, file: str | Path | BytesIO) -> None: ...
+
+ def write_csv(self, file: str | Path | BytesIO | None = None) -> str | None:
+ r"""Write dataframe to comma-separated values (CSV) file.
+
+ Arguments:
+ file: String, path object or file-like object to which the dataframe will be
+ written. If None, the resulting csv format is returned as a string.
+
+ Returns:
+ String or None.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame(
+ ... {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
+ ... )
+ >>> df = nw.from_native(df_native)
+ >>> df.write_csv()
+ 'foo,bar,ham\n1,6.0,a\n2,7.0,b\n3,8.0,c\n'
+
+ If we had passed a file name to `write_csv`, it would have been
+ written to that file.
+ """
+ return self._compliant_frame.write_csv(file)
+
+ def write_parquet(self, file: str | Path | BytesIO) -> None:
+ """Write dataframe to parquet file.
+
+ Arguments:
+ file: String, path object or file-like object to which the dataframe will be
+ written.
+
+ Returns:
+ None.
+
+ Examples:
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
+ >>> df = nw.from_native(df_native)
+ >>> df.write_parquet("out.parquet") # doctest:+SKIP
+ """
+ self._compliant_frame.write_parquet(file)
+
+ def to_numpy(self) -> _2DArray:
+ """Convert this DataFrame to a NumPy ndarray.
+
+ Returns:
+ A NumPy ndarray array.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame({"foo": [1, 2], "bar": [6.5, 7.0]})
+ >>> df = nw.from_native(df_native)
+ >>> df.to_numpy()
+ array([[1. , 6.5],
+ [2. , 7. ]])
+ """
+ return self._compliant_frame.to_numpy(None, copy=None)
+
+ @property
+ def shape(self) -> tuple[int, int]:
+ """Get the shape of the DataFrame.
+
+ Returns:
+ The shape of the dataframe as a tuple.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame({"foo": [1, 2]})
+ >>> df = nw.from_native(df_native)
+ >>> df.shape
+ (2, 1)
+ """
+ return self._compliant_frame.shape
+
+ def get_column(self, name: str) -> Series[Any]:
+ """Get a single column by name.
+
+ Arguments:
+ name: The column name as a string.
+
+ Returns:
+ A Narwhals Series, backed by a native series.
+
+ Notes:
+ Although `name` is typed as `str`, pandas does allow non-string column
+ names, and they will work when passed to this function if the
+ `narwhals.DataFrame` is backed by a pandas dataframe with non-string
+ columns. This function can only be used to extract a column by name, so
+ there is no risk of ambiguity.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame({"a": [1, 2]})
+ >>> df = nw.from_native(df_native)
+ >>> df.get_column("a").to_native()
+ 0 1
+ 1 2
+ Name: a, dtype: int64
+ """
+ return self._series(self._compliant_frame.get_column(name), level=self._level)
+
+ def estimated_size(self, unit: SizeUnit = "b") -> int | float:
+ """Return an estimation of the total (heap) allocated size of the `DataFrame`.
+
+ Estimated size is given in the specified unit (bytes by default).
+
+ Arguments:
+ unit: 'b', 'kb', 'mb', 'gb', 'tb', 'bytes', 'kilobytes', 'megabytes',
+ 'gigabytes', or 'terabytes'.
+
+ Returns:
+ Integer or Float.
+
+ Examples:
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
+ >>> df = nw.from_native(df_native)
+ >>> df.estimated_size()
+ 32
+ """
+ return self._compliant_frame.estimated_size(unit=unit)
+
    # `str` overlaps with `Sequence[str]`
    # We can ignore this but we must keep this overload ordering
    @overload
    def __getitem__(self, item: tuple[SingleIndexSelector, SingleColSelector]) -> Any: ...

    @overload
    def __getitem__(  # type: ignore[overload-overlap]
        self, item: str | tuple[MultiIndexSelector, SingleColSelector]
    ) -> Series[Any]: ...

    @overload
    def __getitem__(
        self,
        item: (
            SingleIndexSelector
            | MultiIndexSelector
            | MultiColSelector
            | tuple[SingleIndexSelector, MultiColSelector]
            | tuple[MultiIndexSelector, MultiColSelector]
        ),
    ) -> Self: ...
    def __getitem__(  # noqa: C901, PLR0912
        self,
        item: (
            SingleIndexSelector
            | SingleColSelector
            | MultiColSelector
            | MultiIndexSelector
            | tuple[SingleIndexSelector, SingleColSelector]
            | tuple[SingleIndexSelector, MultiColSelector]
            | tuple[MultiIndexSelector, SingleColSelector]
            | tuple[MultiIndexSelector, MultiColSelector]
        ),
    ) -> Series[Any] | Self | Any:
        """Extract column or slice of DataFrame.

        Arguments:
            item: How to slice dataframe. What happens depends on what is passed. It's easiest
                to explain by example. Suppose we have a Dataframe `df`

                - `df['a']` extracts column `'a'` and returns a `Series`.
                - `df[0:2]` extracts the first two rows and returns a `DataFrame`.
                - `df[0:2, 'a']` extracts the first two rows from column `'a'` and returns
                    a `Series`.
                - `df[0:2, 0]` extracts the first two rows from the first column and returns
                    a `Series`.
                - `df[[0, 1], [0, 1, 2]]` extracts the first two rows and the first three columns
                    and returns a `DataFrame`
                - `df[:, [0, 1, 2]]` extracts all rows from the first three columns and returns a
                    `DataFrame`.
                - `df[:, ['a', 'c']]` extracts all rows and columns `'a'` and `'c'` and returns a
                    `DataFrame`.
                - `df[['a', 'c']]` extracts all rows and columns `'a'` and `'c'` and returns a
                    `DataFrame`.
                - `df[0: 2, ['a', 'c']]` extracts the first two rows and columns `'a'` and `'c'` and
                    returns a `DataFrame`
                - `df[:, 0: 2]` extracts all rows from the first two columns and returns a `DataFrame`
                - `df[:, 'a': 'c']` extracts all rows and all columns positioned between `'a'` and `'c'`
                    _inclusive_ and returns a `DataFrame`. For example, if the columns are
                    `'a', 'd', 'c', 'b'`, then that would extract columns `'a'`, `'d'`, and `'c'`.

        Returns:
            A Narwhals Series, backed by a native series.

        Notes:
            - Integers are always interpreted as positions
            - Strings are always interpreted as column names.

            In contrast with Polars, pandas allows non-string column names.
            If you don't know whether the column name you're trying to extract
            is definitely a string (e.g. `df[df.columns[0]]`) then you should
            use `DataFrame.get_column` instead.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame({"a": [1, 2]})
            >>> df = nw.from_native(df_native)
            >>> df["a"].to_native()
            0    1
            1    2
            Name: a, dtype: int64
        """
        from narwhals.series import Series

        # Shared error message for selector shapes we don't support.
        msg = (
            f"Unexpected type for `DataFrame.__getitem__`, got: {type(item)}.\n\n"
            "Hints:\n"
            "- use `df.item` to select a single item.\n"
            "- Use `df[indices, :]` to select rows positionally.\n"
            "- Use `df.filter(mask)` to filter rows based on a boolean mask."
        )

        # Step 1: normalise `item` into a (rows, columns) pair, where None
        # means "take everything along that axis".
        if isinstance(item, tuple):
            if len(item) > 2:
                tuple_msg = (
                    "Tuples cannot be passed to DataFrame.__getitem__ directly.\n\n"
                    "Hint: instead of `df[indices]`, did you mean `df[indices, :]`?"
                )
                raise TypeError(tuple_msg)
            rows = None if not item or is_slice_none(item[0]) else item[0]
            columns = None if len(item) < 2 or is_slice_none(item[1]) else item[1]
            if rows is None and columns is None:
                return self
        elif is_index_selector(item):
            # Bare positional selector: rows only.
            rows = item
            columns = None
        elif is_sequence_like(item) or isinstance(item, (slice, str)):
            # Bare name/slice/sequence selector: columns only.
            rows = None
            columns = item
        else:
            raise TypeError(msg)

        # A string in row position is always a mistake.
        if isinstance(rows, str):
            raise TypeError(msg)

        compliant = self._compliant_frame

        # Step 2: single-column selections return a Series (or a scalar when
        # the row is a single int too).
        if isinstance(columns, (int, str)):
            if isinstance(rows, int):
                return self.item(rows, columns)
            col_name = columns if isinstance(columns, str) else self.columns[columns]
            series = self.get_column(col_name)
            return series[rows] if rows is not None else series
        # Step 3: unwrap narwhals Series selectors to their compliant form.
        if isinstance(rows, Series):
            rows = rows._compliant_series
        if isinstance(columns, Series):
            columns = columns._compliant_series
        # Step 4: delegate 2-D selection to the compliant frame.
        if rows is None:
            return self._with_compliant(compliant[:, columns])
        if columns is None:
            return self._with_compliant(compliant[rows, :])
        return self._with_compliant(compliant[rows, columns])
+
    def __contains__(self, key: str) -> bool:
        """Return True if `key` is one of the frame's column names."""
        return key in self.columns
+
+ @overload
+ def to_dict(self, *, as_series: Literal[True] = ...) -> dict[str, Series[Any]]: ...
+ @overload
+ def to_dict(self, *, as_series: Literal[False]) -> dict[str, list[Any]]: ...
+ @overload
+ def to_dict(
+ self, *, as_series: bool
+ ) -> dict[str, Series[Any]] | dict[str, list[Any]]: ...
+ def to_dict(
+ self, *, as_series: bool = True
+ ) -> dict[str, Series[Any]] | dict[str, list[Any]]:
+ """Convert DataFrame to a dictionary mapping column name to values.
+
+ Arguments:
+ as_series: If set to true ``True``, then the values are Narwhals Series,
+ otherwise the values are Any.
+
+ Returns:
+ A mapping from column name to values / Series.
+
+ Examples:
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> df_native = pa.table({"A": [1, 2], "fruits": ["banana", "apple"]})
+ >>> df = nw.from_native(df_native)
+ >>> df.to_dict(as_series=False)
+ {'A': [1, 2], 'fruits': ['banana', 'apple']}
+ """
+ if as_series:
+ return {
+ key: self._series(value, level=self._level)
+ for key, value in self._compliant_frame.to_dict(
+ as_series=as_series
+ ).items()
+ }
+ return self._compliant_frame.to_dict(as_series=as_series)
+
+ def row(self, index: int) -> tuple[Any, ...]:
+ """Get values at given row.
+
+ Warning:
+ You should NEVER use this method to iterate over a DataFrame;
+ if you require row-iteration you should strongly prefer use of iter_rows()
+ instead.
+
+ Arguments:
+ index: Row number.
+
+ Returns:
+ A tuple of the values in the selected row.
+
+ Notes:
+ cuDF doesn't support this method.
+
+ Examples:
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> df_native = pa.table({"a": [1, 2], "b": [4, 5]})
+ >>> nw.from_native(df_native).row(1)
+ (<pyarrow.Int64Scalar: 2>, <pyarrow.Int64Scalar: 5>)
+ """
+ return self._compliant_frame.row(index)
+
+ # inherited
    # inherited
    def pipe(
        self,
        function: Callable[Concatenate[Self, PS], R],
        *args: PS.args,
        **kwargs: PS.kwargs,
    ) -> R:
        """Pipe function call.

        Arguments:
            function: Function to apply.
            args: Positional arguments to pass to function.
            kwargs: Keyword arguments to pass to function.

        Returns:
            The original object with the function applied.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame({"a": [1, 2], "ba": [4, 5]})
            >>> nw.from_native(df_native).pipe(
            ...     lambda _df: _df.select(
            ...         [x for x in _df.columns if len(x) == 1]
            ...     ).to_native()
            ... )
               a
            0  1
            1  2
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().pipe(function, *args, **kwargs)
+
    def drop_nulls(self, subset: str | list[str] | None = None) -> Self:
        """Drop rows that contain null values.

        Arguments:
            subset: Column name(s) for which null values are considered. If set to None
                (default), use all columns.

        Returns:
            The original object with the rows removed that contained the null values.

        Notes:
            pandas handles null values differently from Polars and PyArrow.
            See [null_handling](../concepts/null_handling.md)
            for reference.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"a": [1.0, None], "ba": [1.0, 2.0]})
            >>> nw.from_native(df_native).drop_nulls().to_native()
            pyarrow.Table
            a: double
            ba: double
            ----
            a: [[1]]
            ba: [[1]]
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().drop_nulls(subset=subset)
+
    def with_row_index(self, name: str = "index") -> Self:
        """Insert column which enumerates rows.

        Arguments:
            name: The name of the column as a string. The default is "index".

        Returns:
            The original object with the column added.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"a": [1, 2], "b": [4, 5]})
            >>> nw.from_native(df_native).with_row_index().to_native()
            pyarrow.Table
            index: int64
            a: int64
            b: int64
            ----
            index: [[0,1]]
            a: [[1,2]]
            b: [[4,5]]
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().with_row_index(name)
+
    @property
    def schema(self) -> Schema:
        r"""Get an ordered mapping of column names to their data type.

        Returns:
            A Narwhals Schema object that displays the mapping of column names.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
            >>> nw.from_native(df_native).schema
            Schema({'foo': Int64, 'bar': Float64})
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().schema
+
    def collect_schema(self) -> Schema:
        r"""Get an ordered mapping of column names to their data type.

        For an eager DataFrame this is inexpensive; the method mirrors the
        LazyFrame API so code can be written backend-agnostically.

        Returns:
            A Narwhals Schema object that displays the mapping of column names.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
            >>> nw.from_native(df_native).collect_schema()
            Schema({'foo': Int64, 'bar': Float64})
        """
        return super().collect_schema()
+
    @property
    def columns(self) -> list[str]:
        """Get column names.

        Returns:
            The column names stored in a list.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
            >>> nw.from_native(df_native).columns
            ['foo', 'bar']
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().columns
+
    @overload
    def rows(self, *, named: Literal[False] = False) -> list[tuple[Any, ...]]: ...

    @overload
    def rows(self, *, named: Literal[True]) -> list[dict[str, Any]]: ...

    @overload
    def rows(self, *, named: bool) -> list[tuple[Any, ...]] | list[dict[str, Any]]: ...

    def rows(
        self, *, named: bool = False
    ) -> list[tuple[Any, ...]] | list[dict[str, Any]]:
        """Returns all data in the DataFrame as a list of rows of python-native values.

        Arguments:
            named: By default, each row is returned as a tuple of values given
                in the same order as the frame columns. Setting named=True will
                return rows of dictionaries instead.

        Returns:
            The data as a list of rows.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
            >>> nw.from_native(df_native).rows()
            [(1, 6.0), (2, 7.0)]
        """
        # Delegated to the backend; the ignore silences the union return type.
        return self._compliant_frame.rows(named=named)  # type: ignore[return-value]
+
+ def iter_columns(self) -> Iterator[Series[Any]]:
+ """Returns an iterator over the columns of this DataFrame.
+
+ Yields:
+ A Narwhals Series, backed by a native series.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame({"foo": [1, 2], "bar": [6.0, 7.0]})
+ >>> iter_columns = nw.from_native(df_native).iter_columns()
+ >>> next(iter_columns)
+ ┌───────────────────────┐
+ | Narwhals Series |
+ |-----------------------|
+ |0 1 |
+ |1 2 |
+ |Name: foo, dtype: int64|
+ └───────────────────────┘
+ >>> next(iter_columns)
+ ┌─────────────────────────┐
+ | Narwhals Series |
+ |-------------------------|
+ |0 6.0 |
+ |1 7.0 |
+ |Name: bar, dtype: float64|
+ └─────────────────────────┘
+ """
+ for series in self._compliant_frame.iter_columns():
+ yield self._series(series, level=self._level)
+
    @overload
    def iter_rows(
        self, *, named: Literal[False], buffer_size: int = ...
    ) -> Iterator[tuple[Any, ...]]: ...

    @overload
    def iter_rows(
        self, *, named: Literal[True], buffer_size: int = ...
    ) -> Iterator[dict[str, Any]]: ...

    @overload
    def iter_rows(
        self, *, named: bool, buffer_size: int = ...
    ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: ...

    def iter_rows(
        self, *, named: bool = False, buffer_size: int = 512
    ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
        """Returns an iterator over the DataFrame of rows of python-native values.

        Arguments:
            named: By default, each row is returned as a tuple of values given
                in the same order as the frame columns. Setting named=True will
                return rows of dictionaries instead.
            buffer_size: Determines the number of rows that are buffered
                internally while iterating over the data.
                See https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_rows.html

        Returns:
            An iterator over the DataFrame of rows.

        Notes:
            cuDF doesn't support this method.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"foo": [1, 2], "bar": [6.0, 7.0]})
            >>> iter_rows = nw.from_native(df_native).iter_rows()
            >>> next(iter_rows)
            (1, 6.0)
            >>> next(iter_rows)
            (2, 7.0)
        """
        # Delegated to the backend; the ignore silences the union return type.
        return self._compliant_frame.iter_rows(named=named, buffer_size=buffer_size)  # type: ignore[return-value]
+
    def with_columns(
        self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
    ) -> Self:
        r"""Add columns to this DataFrame.

        Added columns will replace existing columns with the same name.

        Arguments:
            *exprs: Column(s) to add, specified as positional arguments.
                Accepts expression input. Strings are parsed as column names, other
                non-expression inputs are parsed as literals.

            **named_exprs: Additional columns to add, specified as keyword arguments.
                The columns will be renamed to the keyword used.

        Returns:
            DataFrame: A new DataFrame with the columns added.

        Note:
            Creating a new DataFrame using this method does not create a new copy of
            existing data.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]})
            >>> (
            ...     nw.from_native(df_native)
            ...     .with_columns((nw.col("a") * 2).alias("a*2"))
            ...     .to_native()
            ... )
               a    b  a*2
            0  1  0.5    2
            1  2  4.0    4
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().with_columns(*exprs, **named_exprs)
+
    def select(
        self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
    ) -> Self:
        r"""Select columns from this DataFrame.

        Arguments:
            *exprs: Column(s) to select, specified as positional arguments.
                Accepts expression input. Strings are parsed as column names,
                other non-expression inputs are parsed as literals.

            **named_exprs: Additional columns to select, specified as keyword arguments.
                The columns will be renamed to the keyword used.

        Returns:
            The dataframe containing only the selected columns.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"a": [1, 2], "b": [3, 4]})
            >>> nw.from_native(df_native).select("a", a_plus_1=nw.col("a") + 1)
            ┌──────────────────┐
            |Narwhals DataFrame|
            |------------------|
            |pyarrow.Table     |
            |a: int64          |
            |a_plus_1: int64   |
            |----              |
            |a: [[1,2]]        |
            |a_plus_1: [[2,3]] |
            └──────────────────┘
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().select(*exprs, **named_exprs)
+
    def rename(self, mapping: dict[str, str]) -> Self:
        """Rename column names.

        Arguments:
            mapping: Key value pairs that map from old name to new name.

        Returns:
            The dataframe with the specified columns renamed.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"foo": [1, 2], "bar": [6, 7]})
            >>> nw.from_native(df_native).rename({"foo": "apple"}).to_native()
            pyarrow.Table
            apple: int64
            bar: int64
            ----
            apple: [[1,2]]
            bar: [[6,7]]
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().rename(mapping)
+
    def head(self, n: int = 5) -> Self:
        """Get the first `n` rows.

        Arguments:
            n: Number of rows to return. If a negative value is passed, return all rows
                except the last `abs(n)`.

        Returns:
            A subset of the dataframe of shape (n, n_columns).

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]})
            >>> nw.from_native(df_native).head(1).to_native()
               a    b
            0  1  0.5
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().head(n)
+
    def tail(self, n: int = 5) -> Self:
        """Get the last `n` rows.

        Arguments:
            n: Number of rows to return. If a negative value is passed, return all rows
                except the first `abs(n)`.

        Returns:
            A subset of the dataframe of shape (n, n_columns).

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame({"a": [1, 2], "b": [0.5, 4.0]})
            >>> nw.from_native(df_native).tail(1)
            ┌──────────────────┐
            |Narwhals DataFrame|
            |------------------|
            |      a    b      |
            |   1  2  4.0      |
            └──────────────────┘
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().tail(n)
+
    def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self:
        """Remove columns from the dataframe.

        Returns:
            The dataframe with the specified columns removed.

        Arguments:
            *columns: Names of the columns that should be removed from the dataframe.
            strict: Validate that all column names exist in the schema and throw an
                exception if a column name does not exist in the schema.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame(
            ...     {"foo": [1, 2], "bar": [6.0, 7.0], "ham": ["a", "b"]}
            ... )
            >>> nw.from_native(df_native).drop("ham").to_native()
               foo  bar
            0    1  6.0
            1    2  7.0
        """
        # `flatten` accepts both bare strings and iterables of strings.
        return super().drop(*flatten(columns), strict=strict)
+
+ def unique(
+ self,
+ subset: str | list[str] | None = None,
+ *,
+ keep: UniqueKeepStrategy = "any",
+ maintain_order: bool = False,
+ ) -> Self:
+ """Drop duplicate rows from this dataframe.
+
+ Arguments:
+ subset: Column name(s) to consider when identifying duplicate rows.
+ keep: {'first', 'last', 'any', 'none'}
+ Which of the duplicate rows to keep.
+
+ * 'any': Does not give any guarantee of which row is kept.
+ This allows more optimizations.
+ * 'none': Don't keep duplicate rows.
+ * 'first': Keep first unique row.
+ * 'last': Keep last unique row.
+ maintain_order: Keep the same order as the original DataFrame. This may be more
+ expensive to compute.
+
+ Returns:
+ The dataframe with the duplicate rows removed.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame(
+ ... {"foo": [1, 2], "bar": ["a", "a"], "ham": ["b", "b"]}
+ ... )
+ >>> nw.from_native(df_native).unique(["bar", "ham"]).to_native()
+ foo bar ham
+ 0 1 a b
+ """
+ if keep not in {"any", "none", "first", "last"}:
+ msg = f"Expected {'any', 'none', 'first', 'last'}, got: {keep}"
+ raise ValueError(msg)
+ if isinstance(subset, str):
+ subset = [subset]
+ return self._with_compliant(
+ self._compliant_frame.unique(subset, keep=keep, maintain_order=maintain_order)
+ )
+
    def filter(
        self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any
    ) -> Self:
        r"""Filter the rows in the DataFrame based on one or more predicate expressions.

        The original order of the remaining rows is preserved.

        Arguments:
            *predicates: Expression(s) that evaluates to a boolean Series. Can
                also be a (single!) boolean list.
            **constraints: Column filters; use `name = value` to filter columns by the supplied value.
                Each constraint will behave the same as `nw.col(name).eq(value)`, and will be implicitly
                joined with the other filter conditions using &.

        Returns:
            The filtered dataframe.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame(
            ...     {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]}
            ... )

            Filter on one condition

            >>> nw.from_native(df_native).filter(nw.col("foo") > 1).to_native()
               foo  bar ham
            1    2    7   b
            2    3    8   c

            Filter on multiple conditions with implicit `&`

            >>> nw.from_native(df_native).filter(
            ...     nw.col("foo") < 3, nw.col("ham") == "a"
            ... ).to_native()
               foo  bar ham
            0    1    6   a

            Filter on multiple conditions with `|`

            >>> nw.from_native(df_native).filter(
            ...     (nw.col("foo") == 1) | (nw.col("ham") == "c")
            ... ).to_native()
               foo  bar ham
            0    1    6   a
            2    3    8   c

            Filter using `**kwargs` syntax

            >>> nw.from_native(df_native).filter(foo=2, ham="b").to_native()
               foo  bar ham
            1    2    7   b
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().filter(*predicates, **constraints)
+
+ @overload
+ def group_by(
+ self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: Literal[False] = ...
+ ) -> GroupBy[Self]: ...
+
+ @overload
+ def group_by(
+ self, *keys: str | Iterable[str], drop_null_keys: Literal[True]
+ ) -> GroupBy[Self]: ...
+
+ def group_by(
+ self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: bool = False
+ ) -> GroupBy[Self]:
+ r"""Start a group by operation.
+
+ Arguments:
+ *keys: Column(s) to group by. Accepts expression input. Strings are parsed as
+ column names.
+ drop_null_keys: if True, then groups where any key is null won't be included
+ in the result.
+
+ Returns:
+ GroupBy: Object which can be used to perform aggregations.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame(
+ ... {
+ ... "a": ["a", "b", "a", "b", "c"],
+ ... "b": [1, 2, 1, 3, 3],
+ ... "c": [5, 4, 3, 2, 1],
+ ... }
+ ... )
+
+ Group by one column and compute the sum of another column
+
+ >>> nw.from_native(df_native, eager_only=True).group_by("a").agg(
+ ... nw.col("b").sum()
+ ... ).sort("a").to_native()
+ a b
+ 0 a 2
+ 1 b 5
+ 2 c 3
+
+ Group by multiple columns and compute the max of another column
+
+ >>> (
+ ... nw.from_native(df_native, eager_only=True)
+ ... .group_by(["a", "b"])
+ ... .agg(nw.max("c"))
+ ... .sort("a", "b")
+ ... .to_native()
+ ... )
+ a b c
+ 0 a 1 5
+ 1 b 2 4
+ 2 b 3 2
+ 3 c 3 1
+
+ Expressions are also accepted.
+
+ >>> nw.from_native(df_native, eager_only=True).group_by(
+ ... "a", nw.col("b") // 2
+ ... ).agg(nw.col("c").mean()).to_native()
+ a b c
+ 0 a 0 4.0
+ 1 b 1 3.0
+ 2 c 1 1.0
+ """
+ from narwhals.group_by import GroupBy
+
+ flat_keys = flatten(keys)
+
+ if all(isinstance(key, str) for key in flat_keys):
+ return GroupBy(self, flat_keys, drop_null_keys=drop_null_keys)
+
+ from narwhals import col
+ from narwhals.expr import Expr
+ from narwhals.series import Series
+
+ key_is_expr_or_series = tuple(isinstance(k, (Expr, Series)) for k in flat_keys)
+
+ if drop_null_keys and any(key_is_expr_or_series):
+ msg = "drop_null_keys cannot be True when keys contains Expr or Series"
+ raise NotImplementedError(msg)
+
+ _keys = [
+ k if is_expr else col(k)
+ for k, is_expr in zip(flat_keys, key_is_expr_or_series)
+ ]
+ expr_flat_keys, kinds = self._flatten_and_extract(*_keys)
+
+ if not all(kind is ExprKind.ELEMENTWISE for kind in kinds):
+ from narwhals.exceptions import ComputeError
+
+ msg = (
+ "Group by is not supported with keys that are not elementwise expressions"
+ )
+ raise ComputeError(msg)
+
+ return GroupBy(self, expr_flat_keys, drop_null_keys=drop_null_keys)
+
    def sort(
        self,
        by: str | Iterable[str],
        *more_by: str,
        descending: bool | Sequence[bool] = False,
        nulls_last: bool = False,
    ) -> Self:
        r"""Sort the dataframe by the given columns.

        Arguments:
            by: Column(s) names to sort by.
            *more_by: Additional columns to sort by, specified as positional arguments.
            descending: Sort in descending order. When sorting by multiple columns, can be
                specified per column by passing a sequence of booleans.
            nulls_last: Place null values last.

        Returns:
            The sorted dataframe.

        Note:
            Unlike Polars, it is not possible to specify a sequence of booleans for
            `nulls_last` in order to control per-column behaviour. Instead a single
            boolean is applied for all `by` columns.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame(
            ...     {"foo": [2, 1], "bar": [6.0, 7.0], "ham": ["a", "b"]}
            ... )
            >>> nw.from_native(df_native).sort("foo")
            ┌──────────────────┐
            |Narwhals DataFrame|
            |------------------|
            |   foo  bar ham   |
            |1    1  7.0   b   |
            |0    2  6.0   a   |
            └──────────────────┘
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last)
+
    def join(
        self,
        other: Self,
        on: str | list[str] | None = None,
        how: JoinStrategy = "inner",
        *,
        left_on: str | list[str] | None = None,
        right_on: str | list[str] | None = None,
        suffix: str = "_right",
    ) -> Self:
        r"""Join in SQL-like fashion.

        Arguments:
            other: DataFrame to join with.
            on: Name(s) of the join columns in both DataFrames. If set, `left_on` and
                `right_on` should be None.
            how: Join strategy.

                  * *inner*: Returns rows that have matching values in both tables.
                  * *left*: Returns all rows from the left table, and the matched rows from the right table.
                  * *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys.
                  * *cross*: Returns the Cartesian product of rows from both tables.
                  * *semi*: Filter rows that have a match in the right table.
                  * *anti*: Filter rows that do not have a match in the right table.
            left_on: Join column of the left DataFrame.
            right_on: Join column of the right DataFrame.
            suffix: Suffix to append to columns with a duplicate name.

        Returns:
            A new joined DataFrame

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_1_native = pd.DataFrame({"id": ["a", "b"], "price": [6.0, 7.0]})
            >>> df_2_native = pd.DataFrame({"id": ["a", "b", "c"], "qty": [1, 2, 3]})
            >>> nw.from_native(df_1_native).join(nw.from_native(df_2_native), on="id")
            ┌──────────────────┐
            |Narwhals DataFrame|
            |------------------|
            |  id  price  qty  |
            |0  a    6.0    1  |
            |1  b    7.0    2  |
            └──────────────────┘
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().join(
            other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix
        )
+
    def join_asof(
        self,
        other: Self,
        *,
        left_on: str | None = None,
        right_on: str | None = None,
        on: str | None = None,
        by_left: str | list[str] | None = None,
        by_right: str | list[str] | None = None,
        by: str | list[str] | None = None,
        strategy: AsofJoinStrategy = "backward",
        suffix: str = "_right",
    ) -> Self:
        """Perform an asof join.

        This is similar to a left-join except that we match on nearest key rather than equal keys.

        For Polars, both DataFrames must be sorted by the `on` key (within each `by` group
        if specified).

        Arguments:
            other: DataFrame to join with.
            left_on: Name(s) of the left join column(s).
            right_on: Name(s) of the right join column(s).
            on: Join column of both DataFrames. If set, left_on and right_on should be None.
            by_left: join on these columns before doing asof join.
            by_right: join on these columns before doing asof join.
            by: join on these columns before doing asof join.
            strategy: Join strategy. The default is "backward".

                  * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key.
                  * *forward*: selects the first row in the right DataFrame whose "on" key is greater than or equal to the left's key.
                  * *nearest*: search selects the last row in the right DataFrame whose value is nearest to the left's key.
            suffix: Suffix to append to columns with a duplicate name.

        Returns:
            A new joined DataFrame

        Examples:
            >>> from datetime import datetime
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> data_gdp = {
            ...     "datetime": [
            ...         datetime(2016, 1, 1),
            ...         datetime(2017, 1, 1),
            ...         datetime(2018, 1, 1),
            ...         datetime(2019, 1, 1),
            ...         datetime(2020, 1, 1),
            ...     ],
            ...     "gdp": [4164, 4411, 4566, 4696, 4827],
            ... }
            >>> data_population = {
            ...     "datetime": [
            ...         datetime(2016, 3, 1),
            ...         datetime(2018, 8, 1),
            ...         datetime(2019, 1, 1),
            ...     ],
            ...     "population": [82.19, 82.66, 83.12],
            ... }
            >>> gdp_native = pd.DataFrame(data_gdp)
            >>> population_native = pd.DataFrame(data_population)
            >>> gdp = nw.from_native(gdp_native)
            >>> population = nw.from_native(population_native)
            >>> population.join_asof(gdp, on="datetime", strategy="backward")
            ┌──────────────────────────────┐
            |      Narwhals DataFrame      |
            |------------------------------|
            |    datetime  population   gdp|
            |0 2016-03-01       82.19  4164|
            |1 2018-08-01       82.66  4566|
            |2 2019-01-01       83.12  4696|
            └──────────────────────────────┘
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().join_asof(
            other,
            left_on=left_on,
            right_on=right_on,
            on=on,
            by_left=by_left,
            by_right=by_right,
            by=by,
            strategy=strategy,
            suffix=suffix,
        )
+
+ # --- descriptive ---
+ def is_duplicated(self) -> Series[Any]:
+ r"""Get a mask of all duplicated rows in this DataFrame.
+
+ Returns:
+ A new Series.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]})
+ >>> nw.from_native(df_native).is_duplicated()
+ ┌───────────────┐
+ |Narwhals Series|
+ |---------------|
+ | 0 True |
+ | 1 True |
+ | 2 False |
+ | dtype: bool |
+ └───────────────┘
+ """
+ return ~self.is_unique()
+
+ def is_empty(self) -> bool:
+ r"""Check if the dataframe is empty.
+
+ Returns:
+ A boolean indicating whether the dataframe is empty (True) or not (False).
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]})
+ >>> nw.from_native(df_native).is_empty()
+ False
+ """
+ return len(self) == 0
+
    def is_unique(self) -> Series[Any]:
        r"""Get a mask of all unique rows in this DataFrame.

        Returns:
            A new Series.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame({"foo": [2, 2, 2], "bar": [6.0, 6.0, 7.0]})
            >>> nw.from_native(df_native).is_unique()
            ┌───────────────┐
            |Narwhals Series|
            |---------------|
            |  0    False   |
            |  1    False   |
            |  2     True   |
            |  dtype: bool  |
            └───────────────┘
        """
        # Wrap the backend's boolean mask in a Narwhals Series.
        return self._series(self._compliant_frame.is_unique(), level=self._level)
+
+ def null_count(self) -> Self:
+ r"""Create a new DataFrame that shows the null counts per column.
+
+ Returns:
+ A dataframe of shape (1, n_columns).
+
+ Notes:
+ pandas handles null values differently from Polars and PyArrow.
+ See [null_handling](../concepts/null_handling.md/)
+ for reference.
+
+ Examples:
+ >>> import pyarrow as pa
+ >>> import narwhals as nw
+ >>> df_native = pa.table({"foo": [1, None], "bar": [2, 3]})
+ >>> nw.from_native(df_native).null_count()
+ ┌──────────────────┐
+ |Narwhals DataFrame|
+ |------------------|
+ | pyarrow.Table |
+ | foo: int64 |
+ | bar: int64 |
+ | ---- |
+ | foo: [[1]] |
+ | bar: [[0]] |
+ └──────────────────┘
+ """
+ plx = self._compliant_frame.__narwhals_namespace__()
+ result = self._compliant_frame.select(plx.all().null_count())
+ return self._with_compliant(result)
+
    def item(self, row: int | None = None, column: int | str | None = None) -> Any:
        r"""Return the DataFrame as a scalar, or return the element at the given row/column.

        Arguments:
            row: The *n*-th row.
            column: The column selected via an integer or a string (column name).

        Returns:
            A scalar or the specified element in the dataframe.

        Notes:
            If row/col not provided, this is equivalent to df[0,0], with a check that the shape is (1,1).
            With row/col, this is equivalent to df[row,col].

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"foo": [1, None], "bar": [2, 3]})
            >>> nw.from_native(df_native).item(0, 1)
            2
        """
        # Scalar extraction (and the (1,1)-shape check) is handled by the backend.
        return self._compliant_frame.item(row=row, column=column)
+
+ def clone(self) -> Self:
+ r"""Create a copy of this DataFrame.
+
+ Returns:
+ An identical copy of the original dataframe.
+ """
+ return self._with_compliant(self._compliant_frame.clone())
+
    def gather_every(self, n: int, offset: int = 0) -> Self:
        r"""Take every nth row in the DataFrame and return as a new DataFrame.

        Arguments:
            n: Gather every *n*-th row.
            offset: Starting index.

        Returns:
            The dataframe containing only the selected rows.

        Examples:
            >>> import pyarrow as pa
            >>> import narwhals as nw
            >>> df_native = pa.table({"foo": [1, None, 2, 3]})
            >>> nw.from_native(df_native).gather_every(2)
            ┌──────────────────┐
            |Narwhals DataFrame|
            |------------------|
            |  pyarrow.Table   |
            |  foo: int64      |
            |  ----            |
            |  foo: [[1,2]]    |
            └──────────────────┘
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().gather_every(n=n, offset=offset)
+
    def pivot(
        self,
        on: str | list[str],
        *,
        index: str | list[str] | None = None,
        values: str | list[str] | None = None,
        aggregate_function: PivotAgg | None = None,
        maintain_order: bool | None = None,
        sort_columns: bool = False,
        separator: str = "_",
    ) -> Self:
        r"""Create a spreadsheet-style pivot table as a DataFrame.

        Arguments:
            on: Name of the column(s) whose values will be used as the header of the
                output DataFrame.
            index: One or multiple keys to group by. If None, all remaining columns not
                specified on `on` and `values` will be used. At least one of `index` and
                `values` must be specified.
            values: One or multiple column(s) containing the values to be moved (and,
                if `aggregate_function` is given, aggregated) under the new columns.
                If None, all remaining columns not specified on `on` and `index` will
                be used. At least one of `index` and `values` must be specified.
            aggregate_function: Choose from

                - None: no aggregation takes place, will raise error if multiple values
                    are in group.
                - A predefined aggregate function string, one of
                    {'min', 'max', 'first', 'last', 'sum', 'mean', 'median', 'len'}
            maintain_order: Has no effect and is kept around only for backwards-compatibility.
            sort_columns: Sort the transposed columns by name. Default is by order of
                discovery.
            separator: Used as separator/delimiter in generated column names in case of
                multiple `values` columns.

        Returns:
            A new dataframe.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> data = {
            ...     "ix": [1, 1, 2, 2, 1, 2],
            ...     "col": ["a", "a", "a", "a", "b", "b"],
            ...     "foo": [0, 1, 2, 2, 7, 1],
            ...     "bar": [0, 2, 0, 0, 9, 4],
            ... }
            >>> df_native = pd.DataFrame(data)
            >>> nw.from_native(df_native).pivot(
            ...     "col", index="ix", aggregate_function="sum"
            ... )
            ┌─────────────────────────────────┐
            |        Narwhals DataFrame       |
            |---------------------------------|
            |   ix  foo_a  foo_b  bar_a  bar_b|
            |0   1      1      7      2      9|
            |1   2      4      1      0      4|
            └─────────────────────────────────┘
        """
        # At least one of `values`/`index` is required to disambiguate the pivot.
        if values is None and index is None:
            msg = "At least one of `values` and `index` must be passed"
            raise ValueError(msg)
        # `maintain_order` is a deprecated no-op kept for backwards-compatibility.
        if maintain_order is not None:
            msg = (
                "`maintain_order` has no effect and is only kept around for backwards-compatibility. "
                "You can safely remove this argument."
            )
            warn(message=msg, category=UserWarning, stacklevel=find_stacklevel())
        # Normalize single column names into lists for the compliant layer.
        on = [on] if isinstance(on, str) else on
        values = [values] if isinstance(values, str) else values
        index = [index] if isinstance(index, str) else index

        return self._with_compliant(
            self._compliant_frame.pivot(
                on=on,
                index=index,
                values=values,
                aggregate_function=aggregate_function,
                sort_columns=sort_columns,
                separator=separator,
            )
        )
+
    def to_arrow(self) -> pa.Table:
        r"""Convert to arrow table.

        Returns:
            A new PyArrow table.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> df_native = pd.DataFrame({"foo": [1, None], "bar": [2, 3]})
            >>> nw.from_native(df_native).to_arrow()
            pyarrow.Table
            foo: double
            bar: int64
            ----
            foo: [[1,null]]
            bar: [[2,3]]
        """
        # Conversion is delegated to the backend-specific compliant frame.
        return self._compliant_frame.to_arrow()
+
+ def sample(
+ self,
+ n: int | None = None,
+ *,
+ fraction: float | None = None,
+ with_replacement: bool = False,
+ seed: int | None = None,
+ ) -> Self:
+ r"""Sample from this DataFrame.
+
+ Arguments:
+ n: Number of items to return. Cannot be used with fraction.
+ fraction: Fraction of items to return. Cannot be used with n.
+ with_replacement: Allow values to be sampled more than once.
+ seed: Seed for the random number generator. If set to None (default), a random
+ seed is generated for each sample operation.
+
+ Returns:
+ A new dataframe.
+
+ Notes:
+ The results may not be consistent across libraries.
+
+ Examples:
+ >>> import pandas as pd
+ >>> import narwhals as nw
+ >>> df_native = pd.DataFrame({"foo": [1, 2, 3], "bar": [19, 32, 4]})
+ >>> nw.from_native(df_native).sample(n=2) # doctest:+SKIP
+ ┌──────────────────┐
+ |Narwhals DataFrame|
+ |------------------|
+ | foo bar |
+ | 2 3 4 |
+ | 1 2 32 |
+ └──────────────────┘
+ """
+ return self._with_compliant(
+ self._compliant_frame.sample(
+ n=n, fraction=fraction, with_replacement=with_replacement, seed=seed
+ )
+ )
+
    def unpivot(
        self,
        on: str | list[str] | None = None,
        *,
        index: str | list[str] | None = None,
        variable_name: str = "variable",
        value_name: str = "value",
    ) -> Self:
        r"""Unpivot a DataFrame from wide to long format.

        Optionally leaves identifiers set.

        This function is useful to massage a DataFrame into a format where one or more
        columns are identifier variables (index) while all other columns, considered
        measured variables (on), are "unpivoted" to the row axis leaving just
        two non-identifier columns, 'variable' and 'value'.

        Arguments:
            on: Column(s) to use as values variables; if `on` is empty all columns that
                are not in `index` will be used.
            index: Column(s) to use as identifier variables.
            variable_name: Name to give to the `variable` column. Defaults to "variable".
            value_name: Name to give to the `value` column. Defaults to "value".

        Returns:
            The unpivoted dataframe.

        Notes:
            If you're coming from pandas, this is similar to `pandas.DataFrame.melt`,
            but with `index` replacing `id_vars` and `on` replacing `value_vars`.
            In other frameworks, you might know this operation as `pivot_longer`.

        Examples:
            >>> import pandas as pd
            >>> import narwhals as nw
            >>> data = {"a": ["x", "y", "z"], "b": [1, 3, 5], "c": [2, 4, 6]}
            >>> df_native = pd.DataFrame(data)
            >>> nw.from_native(df_native).unpivot(["b", "c"], index="a")
            ┌────────────────────┐
            | Narwhals DataFrame |
            |--------------------|
            |   a variable  value|
            |0  x        b      1|
            |1  y        b      3|
            |2  z        b      5|
            |3  x        c      2|
            |4  y        c      4|
            |5  z        c      6|
            └────────────────────┘
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().unpivot(
            on=on, index=index, variable_name=variable_name, value_name=value_name
        )
+
    def explode(self, columns: str | Sequence[str], *more_columns: str) -> Self:
        """Explode the dataframe to long format by exploding the given columns.

        Notes:
            It is possible to explode multiple columns only if these columns must have
            matching element counts.

        Arguments:
            columns: Column names. The underlying columns being exploded must be of the `List` data type.
            *more_columns: Additional names of columns to explode, specified as positional arguments.

        Returns:
            New DataFrame

        Examples:
            >>> import polars as pl
            >>> import narwhals as nw
            >>> data = {"a": ["x", "y"], "b": [[1, 2], [3]]}
            >>> df_native = pl.DataFrame(data)
            >>> nw.from_native(df_native).explode("b").to_native()
            shape: (3, 2)
            ┌─────┬─────┐
            │ a   ┆ b   │
            │ --- ┆ --- │
            │ str ┆ i64 │
            ╞═════╪═════╡
            │ x   ┆ 1   │
            │ x   ┆ 2   │
            │ y   ┆ 3   │
            └─────┴─────┘
        """
        # Thin wrapper: the shared implementation lives on the base frame class.
        return super().explode(columns, *more_columns)
+
+
class LazyFrame(BaseFrame[FrameT]):
    """Narwhals LazyFrame, backed by a native lazyframe.

    Warning:
        This class is not meant to be instantiated directly - instead use
        [`narwhals.from_native`][] with a native
        object that is a lazy dataframe from one of the supported
        backends (e.g. polars.LazyFrame, dask_expr._collection.DataFrame):
        ```py
        narwhals.from_native(native_lazyframe)
        ```
    """
+
    def _extract_compliant(self, arg: Any) -> Any:
        """Convert a user-facing Narwhals object into its compliant-level counterpart.

        Rejects inputs that cannot be evaluated lazily (Series, order-dependent
        or length-changing expressions) with targeted error messages.
        """
        from narwhals.expr import Expr
        from narwhals.series import Series

        if isinstance(arg, BaseFrame):
            # Another Narwhals frame: unwrap to its compliant frame.
            return arg._compliant_frame
        if isinstance(arg, Series):  # pragma: no cover
            msg = "Binary operations between Series and LazyFrame are not supported."
            raise TypeError(msg)
        if isinstance(arg, str):  # pragma: no cover
            # Bare strings are treated as column names.
            plx = self.__narwhals_namespace__()
            return plx.col(arg)
        if isinstance(arg, Expr):
            # Lazy frames make no row-order guarantees, so order-dependent
            # expressions must be anchored with `.over(order_by=...)` first.
            if arg._metadata.n_orderable_ops:
                msg = (
                    "Order-dependent expressions are not supported for use in LazyFrame.\n\n"
                    "Hint: To make the expression valid, use `.over` with `order_by` specified.\n\n"
                    "For example, if you wrote `nw.col('price').cum_sum()` and you have a column\n"
                    "`'date'` which orders your data, then replace:\n\n"
                    "    nw.col('price').cum_sum()\n\n"
                    " with:\n\n"
                    "    nw.col('price').cum_sum().over(order_by='date')\n"
                    "                   ^^^^^^^^^^^^^^^^^^^^^^\n\n"
                    "See https://narwhals-dev.github.io/narwhals/concepts/order_dependence/."
                )
                raise OrderDependentExprError(msg)
            # Length-changing (filtration) expressions are likewise rejected
            # unless the user has already reduced them to an aggregation.
            if arg._metadata.is_filtration:
                msg = (
                    "Length-changing expressions are not supported for use in LazyFrame, unless\n"
                    "followed by an aggregation.\n\n"
                    "Hints:\n"
                    "- Instead of `lf.select(nw.col('a').head())`, use `lf.select('a').head()\n"
                    "- Instead of `lf.select(nw.col('a').drop_nulls()).select(nw.sum('a'))`,\n"
                    "  use `lf.select(nw.col('a').drop_nulls().sum())\n"
                )
                raise LengthChangingExprError(msg)
            return arg._to_compliant_expr(self.__narwhals_namespace__())
        # A raw Polars object slipping through usually means a missing
        # `nw.from_native` / accidental `pl.col` - give a tailored hint.
        if get_polars() is not None and "polars" in str(type(arg)):  # pragma: no cover
            msg = (
                f"Expected Narwhals object, got: {type(arg)}.\n\n"
                "Perhaps you:\n"
                "- Forgot a `nw.from_native` somewhere?\n"
                "- Used `pl.col` instead of `nw.col`?"
            )
            raise TypeError(msg)
        raise InvalidIntoExprError.from_invalid_type(type(arg))  # pragma: no cover
+
    @property
    def _dataframe(self) -> type[DataFrame[Any]]:
        # Eager wrapper class used when materializing results (see `collect`).
        return DataFrame
+
    def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> None:
        """Wrap a compliant lazyframe; not intended for direct use (see class docstring)."""
        self._level = level
        # Declared up front so type-checkers see the compliant protocol type.
        self._compliant_frame: CompliantLazyFrame[Any, FrameT, Self]  # type: ignore[type-var]
        if is_compliant_lazyframe(df):
            # Objects implementing `__narwhals_lazyframe__` return their compliant wrapper.
            self._compliant_frame = df.__narwhals_lazyframe__()
        else:  # pragma: no cover
            msg = f"Expected Polars LazyFrame or an object that implements `__narwhals_lazyframe__`, got: {type(df)}"
            raise AssertionError(msg)
+
    def __repr__(self) -> str:  # pragma: no cover
        # Wrap the native frame's repr in the standard Narwhals banner box.
        return generate_repr("Narwhals LazyFrame", self.to_native().__repr__())
+
    @property
    def implementation(self) -> Implementation:
        """Return implementation of native frame.

        This can be useful when you need to use special-casing for features outside of
        Narwhals' scope - for example, when dealing with pandas' Period Dtype.

        Returns:
            Implementation.

        Examples:
            >>> import narwhals as nw
            >>> import dask.dataframe as dd
            >>> lf_native = dd.from_dict({"a": [1, 2]}, npartitions=1)
            >>> nw.from_native(lf_native).implementation
            <Implementation.DASK: 'dask'>
        """
        # The compliant frame records which native backend produced it.
        return self._compliant_frame._implementation
+
+ def __getitem__(self, item: str | slice) -> NoReturn:
+ msg = "Slicing is not supported on LazyFrame"
+ raise TypeError(msg)
+
    def collect(
        self, backend: ModuleType | Implementation | str | None = None, **kwargs: Any
    ) -> DataFrame[Any]:
        r"""Materialize this LazyFrame into a DataFrame.

        As each underlying lazyframe has different arguments to set when materializing
        the lazyframe into a dataframe, we allow to pass them as kwargs (see examples
        below for how to generalize the specification).

        Arguments:
            backend: specifies which eager backend collect to. This will be the underlying
                backend for the resulting Narwhals DataFrame. If None, then the following
                default conversions will be applied

                - `polars.LazyFrame` -> `polars.DataFrame`
                - `dask.DataFrame` -> `pandas.DataFrame`
                - `duckdb.PyRelation` -> `pyarrow.Table`
                - `pyspark.DataFrame` -> `pyarrow.Table`

                `backend` can be specified in various ways

                - As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`
                    or `POLARS`.
                - As a string: `"pandas"`, `"pyarrow"` or `"polars"`
                - Directly as a module `pandas`, `pyarrow` or `polars`.
            kwargs: backend specific kwargs to pass along. To know more please check the
                backend specific documentation

                - [polars.LazyFrame.collect](https://docs.pola.rs/api/python/dev/reference/lazyframe/api/polars.LazyFrame.collect.html)
                - [dask.dataframe.DataFrame.compute](https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.compute.html)

        Returns:
            DataFrame

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)")
            >>> lf = nw.from_native(lf_native)
            >>> lf
            ┌──────────────────┐
            |Narwhals LazyFrame|
            |------------------|
            |┌───────┬───────┐ |
            |│ a     │ b     │ |
            |│ int32 │ int32 │ |
            |├───────┼───────┤ |
            |│ 1     │ 2     │ |
            |│ 3     │ 4     │ |
            |└───────┴───────┘ |
            └──────────────────┘
            >>> lf.collect()
            ┌──────────────────┐
            |Narwhals DataFrame|
            |------------------|
            |  pyarrow.Table   |
            |  a: int32        |
            |  b: int32        |
            |  ----            |
            |  a: [[1,3]]      |
            |  b: [[2,4]]      |
            └──────────────────┘
        """
        # Normalise `backend` to an Implementation enum; None keeps the native default.
        eager_backend = None if backend is None else Implementation.from_backend(backend)
        supported_eager_backends = (
            Implementation.POLARS,
            Implementation.PANDAS,
            Implementation.PYARROW,
        )
        if eager_backend is not None and eager_backend not in supported_eager_backends:
            msg = f"Unsupported `backend` value.\nExpected one of {supported_eager_backends} or None, got: {eager_backend}."
            raise ValueError(msg)
        # Materialize via the compliant layer and wrap the result eagerly ("full" level).
        return self._dataframe(
            self._compliant_frame.collect(backend=eager_backend, **kwargs), level="full"
        )
+
    def to_native(self) -> FrameT:
        """Convert Narwhals LazyFrame to native one.

        Returns:
            Object of class that user started with.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)")
            >>> nw.from_native(lf_native).to_native()
            ┌───────┬───────┐
            │   a   │   b   │
            │ int32 │ int32 │
            ├───────┼───────┤
            │     1 │     2 │
            │     3 │     4 │
            └───────┴───────┘
            <BLANKLINE>
        """
        # `pass_through=False` raises rather than returning the input unchanged.
        return to_native(narwhals_object=self, pass_through=False)
+
+ # inherited
    def pipe(
        self,
        function: Callable[Concatenate[Self, PS], R],
        *args: PS.args,
        **kwargs: PS.kwargs,
    ) -> R:
        """Pipe function call.

        Arguments:
            function: Function to apply.
            args: Positional arguments to pass to function.
            kwargs: Keyword arguments to pass to function.

        Returns:
            The original object with the function applied.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)")
            >>> nw.from_native(lf_native).pipe(lambda x: x.select("a")).to_native()
            ┌───────┐
            │   a   │
            │ int32 │
            ├───────┤
            │     1 │
            │     3 │
            └───────┘
            <BLANKLINE>
        """
        # Delegate to the shared BaseFrame implementation.
        return super().pipe(function, *args, **kwargs)
+
    def drop_nulls(self, subset: str | list[str] | None = None) -> Self:
        """Drop rows that contain null values.

        Arguments:
            subset: Column name(s) for which null values are considered. If set to None
                (default), use all columns.

        Returns:
            The original object with the rows removed that contained the null values.

        Notes:
            pandas handles null values differently from Polars and PyArrow.
            See [null_handling](../concepts/null_handling.md/)
            for reference.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, NULL), (3, 4) df(a, b)")
            >>> nw.from_native(lf_native).drop_nulls()
            ┌──────────────────┐
            |Narwhals LazyFrame|
            |------------------|
            |┌───────┬───────┐ |
            |│ a     │ b     │ |
            |│ int32 │ int32 │ |
            |├───────┼───────┤ |
            |│ 3     │ 4     │ |
            |└───────┴───────┘ |
            └──────────────────┘
        """
        # Delegate to the shared BaseFrame implementation.
        return super().drop_nulls(subset=subset)
+
    def with_row_index(self, name: str = "index") -> Self:
        """Insert column which enumerates rows.

        Arguments:
            name: The name of the column as a string. The default is "index".

        Returns:
            The original object with the column added.

        Examples:
            >>> import dask.dataframe as dd
            >>> import narwhals as nw
            >>> lf_native = dd.from_dict({"a": [1, 2], "b": [4, 5]}, npartitions=1)
            >>> nw.from_native(lf_native).with_row_index().collect()
            ┌──────────────────┐
            |Narwhals DataFrame|
            |------------------|
            |   index  a  b    |
            |0      0  1  4    |
            |1      1  2  5    |
            └──────────────────┘
        """
        # Delegate to the shared BaseFrame implementation.
        return super().with_row_index(name)
+
    @property
    def schema(self) -> Schema:
        r"""Get an ordered mapping of column names to their data type.

        Returns:
            A Narwhals Schema object that displays the mapping of column names.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)")
            >>> nw.from_native(lf_native).schema
            Schema({'a': Int32, 'b': Decimal})
        """
        # Delegate to the shared BaseFrame implementation.
        return super().schema
+
    def collect_schema(self) -> Schema:
        r"""Get an ordered mapping of column names to their data type.

        Returns:
            A Narwhals Schema object that displays the mapping of column names.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)")
            >>> nw.from_native(lf_native).collect_schema()
            Schema({'a': Int32, 'b': Decimal})
        """
        # Delegate to the shared BaseFrame implementation.
        return super().collect_schema()
+
    @property
    def columns(self) -> list[str]:
        r"""Get column names.

        Returns:
            The column names stored in a list.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)")
            >>> nw.from_native(lf_native).columns
            ['a', 'b']
        """
        # Delegate to the shared BaseFrame implementation.
        return super().columns
+
+ def with_columns(
+ self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
+ ) -> Self:
+ r"""Add columns to this LazyFrame.
+
+ Added columns will replace existing columns with the same name.
+
+ Arguments:
+ *exprs: Column(s) to add, specified as positional arguments.
+ Accepts expression input. Strings are parsed as column names, other
+ non-expression inputs are parsed as literals.
+
+ **named_exprs: Additional columns to add, specified as keyword arguments.
+ The columns will be renamed to the keyword used.
+
+ Returns:
+ LazyFrame: A new LazyFrame with the columns added.
+
+ Note:
+ Creating a new LazyFrame using this method does not create a new copy of
+ existing data.
+
+ Examples:
+ >>> import duckdb
+ >>> import narwhals as nw
+ >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)")
+ >>> nw.from_native(lf_native).with_columns(c=nw.col("a") + 1)
+ ┌────────────────────────────────┐
+ | Narwhals LazyFrame |
+ |--------------------------------|
+ |┌───────┬──────────────┬───────┐|
+ |│ a │ b │ c │|
+ |│ int32 │ decimal(2,1) │ int32 │|
+ |├───────┼──────────────┼───────┤|
+ |│ 1 │ 4.5 │ 2 │|
+ |│ 3 │ 2.0 │ 4 │|
+ |└───────┴──────────────┴───────┘|
+ └────────────────────────────────┘
+ """
+ if not exprs and not named_exprs:
+ msg = "At least one expression must be passed to LazyFrame.with_columns"
+ raise ValueError(msg)
+ return super().with_columns(*exprs, **named_exprs)
+
+ def select(
+ self, *exprs: IntoExpr | Iterable[IntoExpr], **named_exprs: IntoExpr
+ ) -> Self:
+ r"""Select columns from this LazyFrame.
+
+ Arguments:
+ *exprs: Column(s) to select, specified as positional arguments.
+ Accepts expression input. Strings are parsed as column names.
+ **named_exprs: Additional columns to select, specified as keyword arguments.
+ The columns will be renamed to the keyword used.
+
+ Returns:
+ The LazyFrame containing only the selected columns.
+
+ Notes:
+ If you'd like to select a column whose name isn't a string (for example,
+ if you're working with pandas) then you should explicitly use `nw.col` instead
+ of just passing the column name. For example, to select a column named
+ `0` use `df.select(nw.col(0))`, not `df.select(0)`.
+
+ Examples:
+ >>> import duckdb
+ >>> import narwhals as nw
+ >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)")
+ >>> nw.from_native(lf_native).select("a", a_plus_1=nw.col("a") + 1)
+ ┌────────────────────┐
+ | Narwhals LazyFrame |
+ |--------------------|
+ |┌───────┬──────────┐|
+ |│ a │ a_plus_1 │|
+ |│ int32 │ int32 │|
+ |├───────┼──────────┤|
+ |│ 1 │ 2 │|
+ |│ 3 │ 4 │|
+ |└───────┴──────────┘|
+ └────────────────────┘
+ """
+ if not exprs and not named_exprs:
+ msg = "At least one expression must be passed to LazyFrame.select"
+ raise ValueError(msg)
+ return super().select(*exprs, **named_exprs)
+
    def rename(self, mapping: dict[str, str]) -> Self:
        r"""Rename column names.

        Arguments:
            mapping: Key value pairs that map from old name to new name.

        Returns:
            The LazyFrame with the specified columns renamed.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 4.5), (3, 2.) df(a, b)")
            >>> nw.from_native(lf_native).rename({"a": "c"})
            ┌────────────────────────┐
            |   Narwhals LazyFrame   |
            |------------------------|
            |┌───────┬──────────────┐|
            |│ c     │ b            │|
            |│ int32 │ decimal(2,1) │|
            |├───────┼──────────────┤|
            |│ 1     │ 4.5          │|
            |│ 3     │ 2.0          │|
            |└───────┴──────────────┘|
            └────────────────────────┘
        """
        # Delegate to the shared BaseFrame implementation.
        return super().rename(mapping)
+
    def head(self, n: int = 5) -> Self:
        r"""Get `n` rows.

        Arguments:
            n: Number of rows to return.

        Returns:
            A subset of the LazyFrame of shape (n, n_columns).

        Examples:
            >>> import dask.dataframe as dd
            >>> import narwhals as nw
            >>> lf_native = dd.from_dict({"a": [1, 2, 3], "b": [4, 5, 6]}, npartitions=1)
            >>> nw.from_native(lf_native).head(2).collect()
            ┌──────────────────┐
            |Narwhals DataFrame|
            |------------------|
            |      a  b        |
            |   0  1  4        |
            |   1  2  5        |
            └──────────────────┘
        """
        # Delegate to the shared BaseFrame implementation.
        return super().head(n)
+
    def tail(self, n: int = 5) -> Self:  # pragma: no cover
        r"""Get the last `n` rows.

        Warning:
            `LazyFrame.tail` is deprecated and will be removed in a future version.
            Note: this will remain available in `narwhals.stable.v1`.
            See [stable api](../backcompat.md/) for more information.

        Arguments:
            n: Number of rows to return.

        Returns:
            A subset of the LazyFrame of shape (n, n_columns).
        """
        # NOTE(review): documented as deprecated but, unlike `gather_every`, emits
        # no deprecation warning here - confirm whether that is intentional.
        return super().tail(n)
+
    def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self:
        r"""Remove columns from the LazyFrame.

        Arguments:
            *columns: Names of the columns that should be removed from the dataframe.
            strict: Validate that all column names exist in the schema and throw an
                exception if a column name does not exist in the schema.

        Returns:
            The LazyFrame with the specified columns removed.

        Warning:
            `strict` argument is ignored for `polars<1.0.0`.

            Please consider upgrading to a newer version or pass to eager mode.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 2), (3, 4) df(a, b)")
            >>> nw.from_native(lf_native).drop("a").to_native()
            ┌───────┐
            │   b   │
            │ int32 │
            ├───────┤
            │     2 │
            │     4 │
            └───────┘
            <BLANKLINE>
        """
        # Flatten so both `drop("a", "b")` and `drop(["a", "b"])` are accepted.
        return super().drop(*flatten(columns), strict=strict)
+
+ def unique(
+ self,
+ subset: str | list[str] | None = None,
+ *,
+ keep: LazyUniqueKeepStrategy = "any",
+ ) -> Self:
+ """Drop duplicate rows from this LazyFrame.
+
+ Arguments:
+ subset: Column name(s) to consider when identifying duplicate rows.
+ If set to `None`, use all columns.
+ keep: {'any', 'none'}
+ Which of the duplicate rows to keep.
+
+ * 'any': Does not give any guarantee of which row is kept.
+ * 'none': Don't keep duplicate rows.
+
+ Returns:
+ The LazyFrame with unique rows.
+
+ Examples:
+ >>> import duckdb
+ >>> import narwhals as nw
+ >>> lf_native = duckdb.sql("SELECT * FROM VALUES (1, 1), (3, 4) df(a, b)")
+ >>> nw.from_native(lf_native).unique("a").sort("a", descending=True)
+ ┌──────────────────┐
+ |Narwhals LazyFrame|
+ |------------------|
+ |┌───────┬───────┐ |
+ |│ a │ b │ |
+ |│ int32 │ int32 │ |
+ |├───────┼───────┤ |
+ |│ 3 │ 4 │ |
+ |│ 1 │ 1 │ |
+ |└───────┴───────┘ |
+ └──────────────────┘
+ """
+ if keep not in {"any", "none"}:
+ msg = (
+ "narwhals.LazyFrame makes no assumptions about row order, so only "
+ f"'any' and 'none' are supported for `keep` in `unique`. Got: {keep}."
+ )
+ raise ValueError(msg)
+ if isinstance(subset, str):
+ subset = [subset]
+ return self._with_compliant(
+ self._compliant_frame.unique(subset=subset, keep=keep)
+ )
+
    def filter(
        self, *predicates: IntoExpr | Iterable[IntoExpr] | list[bool], **constraints: Any
    ) -> Self:
        r"""Filter the rows in the LazyFrame based on a predicate expression.

        The original order of the remaining rows is preserved.

        Arguments:
            *predicates: Expression that evaluates to a boolean Series. Can
                also be a (single!) boolean list.
            **constraints: Column filters; use `name = value` to filter columns by the supplied value.
                Each constraint will behave the same as `nw.col(name).eq(value)`, and will be implicitly
                joined with the other filter conditions using &.

        Returns:
            The filtered LazyFrame.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> df_native = duckdb.sql('''
            ...     SELECT * FROM VALUES
            ...         (1, 6, 'a'),
            ...         (2, 7, 'b'),
            ...         (3, 8, 'c')
            ...     df(foo, bar, ham)
            ... ''')

            Filter on one condition

            >>> nw.from_native(df_native).filter(nw.col("foo") > 1).to_native()
            ┌───────┬───────┬─────────┐
            │  foo  │  bar  │   ham   │
            │ int32 │ int32 │ varchar │
            ├───────┼───────┼─────────┤
            │     2 │     7 │ b       │
            │     3 │     8 │ c       │
            └───────┴───────┴─────────┘
            <BLANKLINE>

            Filter on multiple conditions with implicit `&`

            >>> nw.from_native(df_native).filter(
            ...     nw.col("foo") < 3, nw.col("ham") == "a"
            ... ).to_native()
            ┌───────┬───────┬─────────┐
            │  foo  │  bar  │   ham   │
            │ int32 │ int32 │ varchar │
            ├───────┼───────┼─────────┤
            │     1 │     6 │ a       │
            └───────┴───────┴─────────┘
            <BLANKLINE>

            Filter on multiple conditions with `|`

            >>> nw.from_native(df_native).filter(
            ...     (nw.col("foo") == 1) | (nw.col("ham") == "c")
            ... ).to_native()
            ┌───────┬───────┬─────────┐
            │  foo  │  bar  │   ham   │
            │ int32 │ int32 │ varchar │
            ├───────┼───────┼─────────┤
            │     1 │     6 │ a       │
            │     3 │     8 │ c       │
            └───────┴───────┴─────────┘
            <BLANKLINE>

            Filter using `**kwargs` syntax

            >>> nw.from_native(df_native).filter(foo=2, ham="b").to_native()
            ┌───────┬───────┬─────────┐
            │  foo  │  bar  │   ham   │
            │ int32 │ int32 │ varchar │
            ├───────┼───────┼─────────┤
            │     2 │     7 │ b       │
            └───────┴───────┴─────────┘
            <BLANKLINE>
        """
        # A plain Python boolean mask implies a known row count/order, which a
        # LazyFrame cannot guarantee - reject it explicitly.
        if (
            len(predicates) == 1 and is_list_of(predicates[0], bool) and not constraints
        ):  # pragma: no cover
            msg = "`LazyFrame.filter` is not supported with Python boolean masks - use expressions instead."
            raise TypeError(msg)

        return super().filter(*predicates, **constraints)
+
    @overload
    def group_by(
        self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: Literal[False] = ...
    ) -> LazyGroupBy[Self]: ...

    @overload
    def group_by(
        self, *keys: str | Iterable[str], drop_null_keys: Literal[True]
    ) -> LazyGroupBy[Self]: ...

    def group_by(
        self, *keys: IntoExpr | Iterable[IntoExpr], drop_null_keys: bool = False
    ) -> LazyGroupBy[Self]:
        r"""Start a group by operation.

        Arguments:
            *keys: Column(s) to group by. Accepts expression input. Strings are parsed as
                column names.
            drop_null_keys: if True, then groups where any key is null won't be
                included in the result.

        Returns:
            Object which can be used to perform aggregations.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> df_native = duckdb.sql(
            ...     "SELECT * FROM VALUES (1, 'a'), (2, 'b'), (3, 'a') df(a, b)"
            ... )
            >>> df = nw.from_native(df_native)
            >>> df.group_by("b").agg(nw.col("a").sum()).sort("b").to_native()
            ┌─────────┬────────┐
            │    b    │   a    │
            │ varchar │ int128 │
            ├─────────┼────────┤
            │ a       │      4 │
            │ b       │      2 │
            └─────────┴────────┘
            <BLANKLINE>

            Expressions are also accepted.

            >>> df.group_by(nw.col("b").str.len_chars()).agg(
            ...     nw.col("a").sum()
            ... ).to_native()
            ┌───────┬────────┐
            │   b   │   a    │
            │ int64 │ int128 │
            ├───────┼────────┤
            │     1 │      6 │
            └───────┴────────┘
            <BLANKLINE>
        """
        from narwhals.group_by import LazyGroupBy

        flat_keys = flatten(keys)

        # Fast path: plain column names need no expression parsing.
        if all(isinstance(key, str) for key in flat_keys):
            return LazyGroupBy(self, flat_keys, drop_null_keys=drop_null_keys)

        from narwhals import col
        from narwhals.expr import Expr

        key_is_expr = tuple(isinstance(k, Expr) for k in flat_keys)

        if drop_null_keys and any(key_is_expr):
            msg = "drop_null_keys cannot be True when keys contains Expr"
            raise NotImplementedError(msg)

        # Promote remaining string keys to column expressions so all keys are Exprs.
        _keys = [k if is_expr else col(k) for k, is_expr in zip(flat_keys, key_is_expr)]
        expr_flat_keys, kinds = self._flatten_and_extract(*_keys)

        # Group keys must map each input row to exactly one output value.
        if not all(kind is ExprKind.ELEMENTWISE for kind in kinds):
            from narwhals.exceptions import ComputeError

            msg = (
                "Group by is not supported with keys that are not elementwise expressions"
            )
            raise ComputeError(msg)

        return LazyGroupBy(self, expr_flat_keys, drop_null_keys=drop_null_keys)
+
    def sort(
        self,
        by: str | Iterable[str],
        *more_by: str,
        descending: bool | Sequence[bool] = False,
        nulls_last: bool = False,
    ) -> Self:
        r"""Sort the LazyFrame by the given columns.

        Arguments:
            by: Column(s) names to sort by.
            *more_by: Additional columns to sort by, specified as positional arguments.
            descending: Sort in descending order. When sorting by multiple columns, can be
                specified per column by passing a sequence of booleans.
            nulls_last: Place null values last. A single boolean applied to all
                `by` columns.

        Returns:
            The sorted LazyFrame.

        Warning:
            Unlike Polars, it is not possible to specify a sequence of booleans for
            `nulls_last` in order to control per-column behaviour. Instead a single
            boolean is applied for all `by` columns.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> df_native = duckdb.sql(
            ...     "SELECT * FROM VALUES (1, 6.0, 'a'), (2, 5.0, 'c'), (NULL, 4.0, 'b') df(a, b, c)"
            ... )
            >>> df = nw.from_native(df_native)
            >>> df.sort("a")
            ┌──────────────────────────────────┐
            |        Narwhals LazyFrame        |
            |----------------------------------|
            |┌───────┬──────────────┬─────────┐|
            |│ a     │ b            │ c       │|
            |│ int32 │ decimal(2,1) │ varchar │|
            |├───────┼──────────────┼─────────┤|
            |│ NULL  │ 4.0          │ b       │|
            |│ 1     │ 6.0          │ a       │|
            |│ 2     │ 5.0          │ c       │|
            |└───────┴──────────────┴─────────┘|
            └──────────────────────────────────┘
        """
        # Delegate to the shared BaseFrame implementation.
        return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last)
+
    def join(
        self,
        other: Self,
        on: str | list[str] | None = None,
        how: JoinStrategy = "inner",
        *,
        left_on: str | list[str] | None = None,
        right_on: str | list[str] | None = None,
        suffix: str = "_right",
    ) -> Self:
        r"""Add a join operation to the Logical Plan.

        Arguments:
            other: Lazy DataFrame to join with.
            on: Name(s) of the join columns in both DataFrames. If set, `left_on` and
                `right_on` should be None.
            how: Join strategy.

                  * *inner*: Returns rows that have matching values in both tables.
                  * *left*: Returns all rows from the left table, and the matched rows from the right table.
                  * *full*: Returns all rows in both dataframes, with the suffix appended to the right join keys.
                  * *cross*: Returns the Cartesian product of rows from both tables.
                  * *semi*: Filter rows that have a match in the right table.
                  * *anti*: Filter rows that do not have a match in the right table.
            left_on: Join column of the left DataFrame.
            right_on: Join column of the right DataFrame.
            suffix: Suffix to append to columns with a duplicate name.

        Returns:
            A new joined LazyFrame.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> df_native1 = duckdb.sql(
            ...     "SELECT * FROM VALUES (1, 'a'), (2, 'b') df(a, b)"
            ... )
            >>> df_native2 = duckdb.sql(
            ...     "SELECT * FROM VALUES (1, 'x'), (3, 'y') df(a, c)"
            ... )
            >>> df1 = nw.from_native(df_native1)
            >>> df2 = nw.from_native(df_native2)
            >>> df1.join(df2, on="a")
            ┌─────────────────────────────┐
            |     Narwhals LazyFrame      |
            |-----------------------------|
            |┌───────┬─────────┬─────────┐|
            |│ a     │ b       │ c       │|
            |│ int32 │ varchar │ varchar │|
            |├───────┼─────────┼─────────┤|
            |│ 1     │ a       │ x       │|
            |└───────┴─────────┴─────────┘|
            └─────────────────────────────┘
        """
        # Delegate to the shared BaseFrame implementation.
        return super().join(
            other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix
        )
+
    def join_asof(
        self,
        other: Self,
        *,
        left_on: str | None = None,
        right_on: str | None = None,
        on: str | None = None,
        by_left: str | list[str] | None = None,
        by_right: str | list[str] | None = None,
        by: str | list[str] | None = None,
        strategy: AsofJoinStrategy = "backward",
        suffix: str = "_right",
    ) -> Self:
        """Perform an asof join.

        This is similar to a left-join except that we match on nearest key rather than equal keys.

        For Polars, both DataFrames must be sorted by the `on` key (within each `by` group
        if specified).

        Arguments:
            other: DataFrame to join with.
            left_on: Name(s) of the left join column(s).
            right_on: Name(s) of the right join column(s).
            on: Join column of both DataFrames. If set, left_on and right_on should be None.
            by_left: join on these columns before doing asof join
            by_right: join on these columns before doing asof join
            by: join on these columns before doing asof join
            strategy: Join strategy. The default is "backward".

                  * *backward*: selects the last row in the right DataFrame whose "on" key is less than or equal to the left's key.
                  * *forward*: selects the first row in the right DataFrame whose "on" key is greater than or equal to the left's key.
                  * *nearest*: search selects the last row in the right DataFrame whose value is nearest to the left's key.

            suffix: Suffix to append to columns with a duplicate name.

        Returns:
            A new joined LazyFrame.

        Examples:
            >>> from datetime import datetime
            >>> import polars as pl
            >>> import narwhals as nw
            >>> data_gdp = {
            ...     "datetime": [
            ...         datetime(2016, 1, 1),
            ...         datetime(2017, 1, 1),
            ...         datetime(2018, 1, 1),
            ...         datetime(2019, 1, 1),
            ...         datetime(2020, 1, 1),
            ...     ],
            ...     "gdp": [4164, 4411, 4566, 4696, 4827],
            ... }
            >>> data_population = {
            ...     "datetime": [
            ...         datetime(2016, 3, 1),
            ...         datetime(2018, 8, 1),
            ...         datetime(2019, 1, 1),
            ...     ],
            ...     "population": [82.19, 82.66, 83.12],
            ... }
            >>> gdp_native = pl.DataFrame(data_gdp)
            >>> population_native = pl.DataFrame(data_population)
            >>> gdp = nw.from_native(gdp_native)
            >>> population = nw.from_native(population_native)
            >>> population.join_asof(gdp, on="datetime", strategy="backward").to_native()
            shape: (3, 3)
            ┌─────────────────────┬────────────┬──────┐
            │ datetime            ┆ population ┆ gdp  │
            │ ---                 ┆ ---        ┆ ---  │
            │ datetime[μs]        ┆ f64        ┆ i64  │
            ╞═════════════════════╪════════════╪══════╡
            │ 2016-03-01 00:00:00 ┆ 82.19      ┆ 4164 │
            │ 2018-08-01 00:00:00 ┆ 82.66      ┆ 4566 │
            │ 2019-01-01 00:00:00 ┆ 83.12      ┆ 4696 │
            └─────────────────────┴────────────┴──────┘
        """
        # Delegate to the shared BaseFrame implementation.
        return super().join_asof(
            other,
            left_on=left_on,
            right_on=right_on,
            on=on,
            by_left=by_left,
            by_right=by_right,
            by=by,
            strategy=strategy,
            suffix=suffix,
        )
+
    def lazy(self) -> Self:
        """Restrict available API methods to lazy-only ones.

        This is a no-op, and exists only for compatibility with `DataFrame.lazy`.

        Returns:
            A LazyFrame.
        """
        # Already lazy - return self unchanged.
        return self
+
    def gather_every(self, n: int, offset: int = 0) -> Self:
        r"""Take every nth row in the DataFrame and return as a new DataFrame.

        Warning:
            `LazyFrame.gather_every` is deprecated and will be removed in a future version.
            Note: this will remain available in `narwhals.stable.v1`.
            See [stable api](../backcompat.md/) for more information.

        Arguments:
            n: Gather every *n*-th row.
            offset: Starting index.

        Returns:
            The LazyFrame containing only the selected rows.
        """
        # Warn first, then delegate; the method still works during the deprecation period.
        msg = (
            "`LazyFrame.gather_every` is deprecated and will be removed in a future version.\n\n"
            "Note: this will remain available in `narwhals.stable.v1`.\n"
            "See https://narwhals-dev.github.io/narwhals/backcompat/ for more information.\n"
        )
        issue_deprecation_warning(msg, _version="1.29.0")

        return super().gather_every(n=n, offset=offset)
+
    def unpivot(
        self,
        on: str | list[str] | None = None,
        *,
        index: str | list[str] | None = None,
        variable_name: str = "variable",
        value_name: str = "value",
    ) -> Self:
        r"""Unpivot a DataFrame from wide to long format.

        Optionally leaves identifiers set.

        This function is useful to massage a DataFrame into a format where one or more
        columns are identifier variables (index) while all other columns, considered
        measured variables (on), are "unpivoted" to the row axis leaving just
        two non-identifier columns, 'variable' and 'value'.

        Arguments:
            on: Column(s) to use as values variables; if `on` is empty all columns that
                are not in `index` will be used.
            index: Column(s) to use as identifier variables.
            variable_name: Name to give to the `variable` column. Defaults to "variable".
            value_name: Name to give to the `value` column. Defaults to "value".

        Returns:
            The unpivoted LazyFrame.

        Notes:
            If you're coming from pandas, this is similar to `pandas.DataFrame.melt`,
            but with `index` replacing `id_vars` and `on` replacing `value_vars`.
            In other frameworks, you might know this operation as `pivot_longer`.

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> df_native = duckdb.sql(
            ...     "SELECT * FROM VALUES ('x', 1, 2), ('y', 3, 4), ('z', 5, 6) df(a, b, c)"
            ... )
            >>> df = nw.from_native(df_native)
            >>> df.unpivot(on=["b", "c"], index="a").sort("a", "variable").to_native()
            ┌─────────┬──────────┬───────┐
            │    a    │ variable │ value │
            │ varchar │ varchar  │ int32 │
            ├─────────┼──────────┼───────┤
            │ x       │ b        │     1 │
            │ x       │ c        │     2 │
            │ y       │ b        │     3 │
            │ y       │ c        │     4 │
            │ z       │ b        │     5 │
            │ z       │ c        │     6 │
            └─────────┴──────────┴───────┘
            <BLANKLINE>
        """
        # Delegate to the shared BaseFrame implementation.
        return super().unpivot(
            on=on, index=index, variable_name=variable_name, value_name=value_name
        )
+
    def explode(self, columns: str | Sequence[str], *more_columns: str) -> Self:
        """Explode the dataframe to long format by exploding the given columns.

        Notes:
            It is possible to explode multiple columns only if these columns have
            matching element counts.

        Arguments:
            columns: Column names. The underlying columns being exploded must be of the `List` data type.
            *more_columns: Additional names of columns to explode, specified as positional arguments.

        Returns:
            New LazyFrame

        Examples:
            >>> import duckdb
            >>> import narwhals as nw
            >>> df_native = duckdb.sql(
            ...     "SELECT * FROM VALUES ('x', [1, 2]), ('y', [3, 4]), ('z', [5, 6]) df(a, b)"
            ... )
            >>> df = nw.from_native(df_native)
            >>> df.explode("b").to_native()
            ┌─────────┬───────┐
            │    a    │   b   │
            │ varchar │ int32 │
            ├─────────┼───────┤
            │ x       │     1 │
            │ x       │     2 │
            │ y       │     3 │
            │ y       │     4 │
            │ z       │     5 │
            │ z       │     6 │
            └─────────┴───────┘
            <BLANKLINE>
        """
        # Delegate to the shared BaseFrame implementation.
        return super().explode(columns, *more_columns)