add code for analysis of data

author: sotech117 <michael_foiani@brown.edu> 2025-07-31 17:27:24 -0400
committer: sotech117 <michael_foiani@brown.edu> 2025-07-31 17:27:24 -0400
commit: 5bf22fc7e3c392c8bd44315ca2d06d7dca7d084e (patch)
tree: 8dacb0f195df1c0788d36dd0064f6bbaa3143ede /venv/lib/python3.8/site-packages/narwhals/expr_str.py
parent: b832d364da8c2efe09e3f75828caf73c50d01ce3 (diff)
1 files changed, 449 insertions, 0 deletions
diff --git a/venv/lib/python3.8/site-packages/narwhals/expr_str.py b/venv/lib/python3.8/site-packages/narwhals/expr_str.py
new file mode 100644
index 0000000..e598ff7
--- /dev/null
+++ b/venv/lib/python3.8/site-packages/narwhals/expr_str.py
@@ -0,0 +1,449 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Generic, TypeVar
+
+if TYPE_CHECKING:
+    from narwhals.expr import Expr
+
+ExprT = TypeVar("ExprT", bound="Expr")
+
+
+class ExprStringNamespace(Generic[ExprT]):
+    def __init__(self, expr: ExprT) -> None:
+        self._expr = expr  # expression the string methods of this namespace operate on
+
+    def len_chars(self) -> ExprT:
+        r"""Count the characters in every string value.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import polars as pl
+            >>> import narwhals as nw
+            >>> df_native = pl.DataFrame({"words": ["foo", "345", None]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(words_len=nw.col("words").str.len_chars())
+            ┌─────────────────────┐
+            | Narwhals DataFrame  |
+            |---------------------|
+            |shape: (3, 2)        |
+            |┌───────┬───────────┐|
+            |│ words ┆ words_len │|
+            |│ ---   ┆ ---       │|
+            |│ str   ┆ u32       │|
+            |╞═══════╪═══════════╡|
+            |│ foo   ┆ 3         │|
+            |│ 345   ┆ 3         │|
+            |│ null  ┆ null      │|
+            |└───────┴───────────┘|
+            └─────────────────────┘
+        """
+        expr = self._expr
+        to_compliant = expr._to_compliant_expr
+        return expr._with_elementwise_op(lambda plx: to_compliant(plx).str.len_chars())
+
+    def replace(
+        self, pattern: str, value: str, *, literal: bool = False, n: int = 1
+    ) -> ExprT:
+        r"""Swap the first regex/literal match in each string for a new value.
+
+        Arguments:
+            pattern: Regular expression pattern to look for.
+            value: Replacement text for the matched substring.
+            literal: Interpret `pattern` as plain text instead of a regex.
+            n: How many matches to replace.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import narwhals as nw
+            >>> df_native = pd.DataFrame({"foo": ["123abc", "abc abc123"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(replaced=nw.col("foo").str.replace("abc", ""))
+            ┌──────────────────────┐
+            |  Narwhals DataFrame  |
+            |----------------------|
+            |          foo replaced|
+            |0      123abc      123|
+            |1  abc abc123   abc123|
+            └──────────────────────┘
+        """
+        expr = self._expr
+        to_compliant = expr._to_compliant_expr
+        return expr._with_elementwise_op(
+            lambda plx: to_compliant(plx).str.replace(pattern, value, literal=literal, n=n)
+        )
+
+    def replace_all(self, pattern: str, value: str, *, literal: bool = False) -> ExprT:
+        r"""Swap every regex/literal match in each string for a new value.
+
+        Arguments:
+            pattern: Regular expression pattern to look for.
+            value: Replacement text for each matched substring.
+            literal: Interpret `pattern` as plain text instead of a regex.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import narwhals as nw
+            >>> df_native = pd.DataFrame({"foo": ["123abc", "abc abc123"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(replaced=nw.col("foo").str.replace_all("abc", ""))
+            ┌──────────────────────┐
+            |  Narwhals DataFrame  |
+            |----------------------|
+            |          foo replaced|
+            |0      123abc      123|
+            |1  abc abc123      123|
+            └──────────────────────┘
+        """
+        wrap = self._expr._with_elementwise_op
+        to_compliant = self._expr._to_compliant_expr
+        return wrap(
+            lambda plx: to_compliant(plx).str.replace_all(pattern, value, literal=literal)
+        )
+
+    def strip_chars(self, characters: str | None = None) -> ExprT:
+        r"""Trim characters from both ends of each string.
+
+        Arguments:
+            characters: The set of characters to strip. Every combination of the
+                characters in this set is removed from the start and the end of the
+                string. When left as None (default), leading and trailing whitespace
+                is removed instead.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import polars as pl
+            >>> import narwhals as nw
+            >>> df_native = pl.DataFrame({"fruits": ["apple", "\nmango"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(stripped=nw.col("fruits").str.strip_chars()).to_dict(
+            ...     as_series=False
+            ... )
+            {'fruits': ['apple', '\nmango'], 'stripped': ['apple', 'mango']}
+        """
+        wrap = self._expr._with_elementwise_op
+        to_compliant = self._expr._to_compliant_expr
+        return wrap(lambda plx: to_compliant(plx).str.strip_chars(characters))
+
+    def starts_with(self, prefix: str) -> ExprT:
+        r"""Test whether each string value begins with a given substring.
+
+        Arguments:
+            prefix: Substring expected at the start of each value.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import narwhals as nw
+            >>> df_native = pd.DataFrame({"fruits": ["apple", "mango", None]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(has_prefix=nw.col("fruits").str.starts_with("app"))
+            ┌───────────────────┐
+            |Narwhals DataFrame |
+            |-------------------|
+            |  fruits has_prefix|
+            |0  apple       True|
+            |1  mango      False|
+            |2   None       None|
+            └───────────────────┘
+        """
+        wrap = self._expr._with_elementwise_op
+        to_compliant = self._expr._to_compliant_expr
+        return wrap(lambda plx: to_compliant(plx).str.starts_with(prefix))
+
+    def ends_with(self, suffix: str) -> ExprT:
+        r"""Test whether each string value finishes with a given substring.
+
+        Arguments:
+            suffix: Substring expected at the end of each value.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import narwhals as nw
+            >>> df_native = pd.DataFrame({"fruits": ["apple", "mango", None]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(has_suffix=nw.col("fruits").str.ends_with("ngo"))
+            ┌───────────────────┐
+            |Narwhals DataFrame |
+            |-------------------|
+            |  fruits has_suffix|
+            |0  apple      False|
+            |1  mango       True|
+            |2   None       None|
+            └───────────────────┘
+        """
+        expr = self._expr
+        to_compliant = expr._to_compliant_expr
+        return expr._with_elementwise_op(lambda plx: to_compliant(plx).str.ends_with(suffix))
+
+    def contains(self, pattern: str, *, literal: bool = False) -> ExprT:
+        r"""Test whether each string holds a substring matching a pattern.
+
+        Arguments:
+            pattern: Character sequence or regular expression pattern to search for.
+            literal: When True, `pattern` is matched as a literal string.
+                     When False, `pattern` is taken to be a regular expression.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> df_native = pa.table({"pets": ["cat", "dog", "rabbit and parrot"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(
+            ...     default_match=nw.col("pets").str.contains("cat|parrot"),
+            ...     case_insensitive_match=nw.col("pets").str.contains("cat|(?i)parrot"),
+            ... ).to_native()
+            pyarrow.Table
+            pets: string
+            default_match: bool
+            case_insensitive_match: bool
+            ----
+            pets: [["cat","dog","rabbit and parrot"]]
+            default_match: [[true,false,true]]
+            case_insensitive_match: [[true,false,true]]
+        """
+        wrap = self._expr._with_elementwise_op
+        to_compliant = self._expr._to_compliant_expr
+        return wrap(
+            lambda plx: to_compliant(plx).str.contains(pattern, literal=literal)
+        )
+
+    def slice(self, offset: int, length: int | None = None) -> ExprT:
+        r"""Extract a substring from each string value of an expression.
+
+        Arguments:
+            offset: Index at which the slice starts. Negative indexing is supported.
+            length: Number of characters in the slice. With `None` (default), the slice
+                runs to the end of the string.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import narwhals as nw
+            >>> df_native = pd.DataFrame({"s": ["pear", None, "papaya"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(s_sliced=nw.col("s").str.slice(4, length=3))
+            ┌──────────────────┐
+            |Narwhals DataFrame|
+            |------------------|
+            |        s s_sliced|
+            |0    pear         |
+            |1    None     None|
+            |2  papaya       ya|
+            └──────────────────┘
+        """
+        expr = self._expr
+        to_compliant = expr._to_compliant_expr
+        return expr._with_elementwise_op(
+            lambda plx: to_compliant(plx).str.slice(offset=offset, length=length)
+        )
+
+    def split(self, by: str) -> ExprT:
+        r"""Break each string value of an expression apart at a substring.
+
+        Arguments:
+            by: Substring that separates the pieces.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import polars as pl
+            >>> import narwhals as nw
+            >>> df_native = pl.DataFrame({"s": ["foo bar", "foo_bar"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(nw.col("s").str.split("_").alias("s_split"))
+            ┌────────────────────────────┐
+            |     Narwhals DataFrame     |
+            |----------------------------|
+            |shape: (2, 2)               |
+            |┌─────────┬────────────────┐|
+            |│ s       ┆ s_split        │|
+            |│ ---     ┆ ---            │|
+            |│ str     ┆ list[str]      │|
+            |╞═════════╪════════════════╡|
+            |│ foo bar ┆ ["foo bar"]    │|
+            |│ foo_bar ┆ ["foo", "bar"] │|
+            |└─────────┴────────────────┘|
+            └────────────────────────────┘
+        """
+        expr = self._expr
+        to_compliant = expr._to_compliant_expr
+        return expr._with_elementwise_op(lambda plx: to_compliant(plx).str.split(by=by))
+
+    def head(self, n: int = 5) -> ExprT:
+        r"""Keep the leading n elements of each string.
+
+        Arguments:
+            n: How many elements to keep. Negative indexing is **not** supported.
+
+        Returns:
+            A new expression.
+
+        Notes:
+            A string with fewer than `n` characters is returned in full.
+
+        Examples:
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> df_native = pa.table({"lyrics": ["taata", "taatatata", "zukkyun"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(lyrics_head=nw.col("lyrics").str.head()).to_native()
+            pyarrow.Table
+            lyrics: string
+            lyrics_head: string
+            ----
+            lyrics: [["taata","taatatata","zukkyun"]]
+            lyrics_head: [["taata","taata","zukky"]]
+        """
+        wrap = self._expr._with_elementwise_op
+        to_compliant = self._expr._to_compliant_expr
+        return wrap(lambda plx: to_compliant(plx).str.slice(0, n))
+
+    def tail(self, n: int = 5) -> ExprT:
+        r"""Take the last n elements of each string.
+
+        Arguments:
+            n: Number of elements to take. Negative indexing is **not** supported.
+
+        Returns:
+            A new expression.
+
+        Notes:
+            Strings shorter than `n` are returned whole; `n=0` yields empty strings.
+
+        Examples:
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> df_native = pa.table({"lyrics": ["taata", "taatatata", "zukkyun"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(lyrics_tail=nw.col("lyrics").str.tail()).to_native()
+            pyarrow.Table
+            lyrics: string
+            lyrics_tail: string
+            ----
+            lyrics: [["taata","taatatata","zukkyun"]]
+            lyrics_tail: [["taata","atata","kkyun"]]
+        """
+        offset, length = (0, 0) if n == 0 else (-n, None)  # offset=-0 would keep it all
+        to_compliant = self._expr._to_compliant_expr
+        return self._expr._with_elementwise_op(
+            lambda plx: to_compliant(plx).str.slice(offset=offset, length=length)
+        )
+
+    def to_datetime(self, format: str | None = None) -> ExprT:
+        """Parse string values into the Datetime dtype.
+
+        Notes:
+            - pandas uses nanoseconds as its default time unit, Polars microseconds.
+              Before pandas 2.0, nanoseconds were the only time unit pandas
+              offered, and no other could be chosen. Support for choosing the
+              pandas time unit, where the installed version allows it, will arrive.
+            - timezone-aware strings are all converted to and parsed as UTC.
+
+        Warning:
+            Backends infer the format in different ways, so with `format=None`
+            the results are not guaranteed to be equal across backends.
+
+        Arguments:
+            format: Format used for the conversion. With None (default), the format
+                is inferred from the data.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import polars as pl
+            >>> import narwhals as nw
+            >>> df_native = pl.DataFrame({"a": ["2020-01-01", "2020-01-02"]})
+            >>> df = nw.from_native(df_native)
+            >>> df.select(nw.col("a").str.to_datetime(format="%Y-%m-%d"))
+            ┌───────────────────────┐
+            |  Narwhals DataFrame   |
+            |-----------------------|
+            |shape: (2, 1)          |
+            |┌─────────────────────┐|
+            |│ a                   │|
+            |│ ---                 │|
+            |│ datetime[μs]        │|
+            |╞═════════════════════╡|
+            |│ 2020-01-01 00:00:00 │|
+            |│ 2020-01-02 00:00:00 │|
+            |└─────────────────────┘|
+            └───────────────────────┘
+        """
+        wrap = self._expr._with_elementwise_op
+        to_compliant = self._expr._to_compliant_expr
+        return wrap(lambda plx: to_compliant(plx).str.to_datetime(format=format))
+
+    def to_uppercase(self) -> ExprT:
+        r"""Convert each string to its uppercase form.
+
+        Returns:
+            A new expression.
+
+        Notes:
+            With the PyArrow backend, 'ß' becomes 'ẞ' rather than 'SS'.
+            For more info see [the related issue](https://github.com/apache/arrow/issues/34599).
+            Implementations may differ on other unicode edge cases as well.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import narwhals as nw
+            >>> df_native = pd.DataFrame({"fruits": ["apple", None]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(upper_col=nw.col("fruits").str.to_uppercase())
+            ┌──────────────────┐
+            |Narwhals DataFrame|
+            |------------------|
+            |  fruits upper_col|
+            |0  apple     APPLE|
+            |1   None      None|
+            └──────────────────┘
+        """
+        wrap = self._expr._with_elementwise_op
+        to_compliant = self._expr._to_compliant_expr
+        return wrap(lambda plx: to_compliant(plx).str.to_uppercase())
+
+    def to_lowercase(self) -> ExprT:
+        r"""Convert each string to its lowercase form.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import narwhals as nw
+            >>> df_native = pd.DataFrame({"fruits": ["APPLE", None]})
+            >>> df = nw.from_native(df_native)
+            >>> df.with_columns(lower_col=nw.col("fruits").str.to_lowercase())
+            ┌──────────────────┐
+            |Narwhals DataFrame|
+            |------------------|
+            |  fruits lower_col|
+            |0  APPLE     apple|
+            |1   None      None|
+            └──────────────────┘
+        """
+        wrap = self._expr._with_elementwise_op
+        to_compliant = self._expr._to_compliant_expr
+        return wrap(lambda plx: to_compliant(plx).str.to_lowercase())
author	sotech117 <michael_foiani@brown.edu>	2025-07-31 17:27:24 -0400
committer	sotech117 <michael_foiani@brown.edu>	2025-07-31 17:27:24 -0400
commit	5bf22fc7e3c392c8bd44315ca2d06d7dca7d084e (patch)
tree	8dacb0f195df1c0788d36dd0064f6bbaa3143ede /venv/lib/python3.8/site-packages/narwhals/expr_str.py
parent	b832d364da8c2efe09e3f75828caf73c50d01ce3 (diff)