Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
LTRIM(`t0`.`string_col`, ' \t\n\r\v\f') AS `LStrip(string_col)`
REGEXP_REPLACE(`t0`.`string_col`, '^\\s+', '') AS `LStrip(string_col)`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
RTRIM(`t0`.`string_col`, ' \t\n\r\v\f') AS `RStrip(string_col)`
REGEXP_REPLACE(`t0`.`string_col`, '\\s+$', '') AS `RStrip(string_col)`
FROM `functional_alltypes` AS `t0`
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
SELECT
RTRIM(LTRIM(`t0`.`string_col`, ' \t\n\r\v\f'), ' \t\n\r\v\f') AS `Strip(string_col)`
REGEXP_REPLACE(`t0`.`string_col`, '^\\s+|\\s+$', '') AS `Strip(string_col)`
FROM `functional_alltypes` AS `t0`
15 changes: 5 additions & 10 deletions ibis/backends/sql/compilers/impala.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from string import whitespace as WHITESPACE

import sqlglot as sg
import sqlglot.expressions as sge

Expand Down Expand Up @@ -45,6 +43,7 @@ class ImpalaCompiler(SQLGlotCompiler):
ops.ArgMin,
ops.Covariance,
ops.ExtractDayOfYear,
ops.Kurtosis,
ops.Levenshtein,
ops.Map,
ops.Median,
Expand All @@ -59,7 +58,6 @@ class ImpalaCompiler(SQLGlotCompiler):
ops.TimestampBucket,
ops.TimestampDelta,
ops.Unnest,
ops.Kurtosis,
)

SIMPLE_OPS = {
Expand All @@ -78,8 +76,8 @@ class ImpalaCompiler(SQLGlotCompiler):
ops.ExtractEpochSeconds: "unix_timestamp",
ops.Hash: "fnv_hash",
ops.Ln: "ln",
ops.TypeOf: "typeof",
ops.RegexReplace: "regexp_replace",
ops.TypeOf: "typeof",
}

@staticmethod
Expand Down Expand Up @@ -325,16 +323,13 @@ def visit_DateDelta(self, op, *, left, right, part):
return self.f.datediff(left, right)

def visit_LStrip(self, op, *, arg):
return self.f.anon.ltrim(arg, WHITESPACE)
return self.f.regexp_replace(arg, r"^\s+", "")

def visit_RStrip(self, op, *, arg):
return self.f.anon.rtrim(arg, WHITESPACE)
return self.f.regexp_replace(arg, r"\s+$", "")

def visit_Strip(self, op, *, arg):
# Impala's `TRIM` doesn't allow specifying characters to trim off, unlike
# Impala's `RTRIM` and `LTRIM` which accept a set of characters to
# remove.
return self.f.anon.rtrim(self.f.anon.ltrim(arg, WHITESPACE), WHITESPACE)
return self.f.regexp_replace(arg, r"^\s+|\s+$", "")


compiler = ImpalaCompiler()
11 changes: 9 additions & 2 deletions ibis/backends/sql/compilers/pyspark.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,6 @@ class PySparkCompiler(SQLGlotCompiler):
ops.EndsWith: "endswith",
ops.Hash: "hash",
ops.Log10: "log10",
ops.LStrip: "ltrim",
ops.RStrip: "rtrim",
ops.MapLength: "size",
ops.MapContains: "map_contains_key",
ops.MapMerge: "map_concat",
Expand Down Expand Up @@ -684,5 +682,14 @@ def visit_ArraySum(self, op, *, arg):
def visit_ArrayMean(self, op, *, arg):
return self._array_reduction(dtype=op.dtype, arg=arg, output=operator.truediv)

def visit_LStrip(self, op, *, arg):
return self.f.regexp_replace(arg, r"^\s+", "")

def visit_RStrip(self, op, *, arg):
return self.f.regexp_replace(arg, r"\s+$", "")

def visit_Strip(self, op, *, arg):
return self.f.regexp_replace(arg, r"^\s+|\s+$", "")


compiler = PySparkCompiler()
22 changes: 5 additions & 17 deletions ibis/backends/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1144,8 +1144,9 @@ def string_temp_table(backend, con):
"aBc",
"🐍",
"ÉéÈèêç",
"fluf\f",
],
"index_col": [0, 1, 2, 3, 4, 5, 6],
"index_col": [0, 1, 2, 3, 4, 5, 6, 7],
}
)

Expand Down Expand Up @@ -1277,7 +1278,7 @@ def string_temp_table(backend, con):
),
param(
lambda t: t.string_col.find_in_set(["aBc", "123"]),
lambda _: pd.Series([-1, -1, -1, 1, 0, -1, -1], name="tmp"),
lambda _: pd.Series([-1, -1, -1, 1, 0, -1, -1, -1], name="tmp"),
id="find_in_set",
marks=[
pytest.mark.notyet(
Expand Down Expand Up @@ -1306,7 +1307,7 @@ def string_temp_table(backend, con):
),
param(
lambda t: t.string_col.find_in_set(["abc, 123"]),
lambda _: pd.Series([-1, -1, -1, -1, -1, -1, -1], name="tmp"),
lambda _: pd.Series([-1, -1, -1, -1, -1, -1, -1, -1], name="tmp"),
id="find_in_set_w_comma",
marks=[
pytest.mark.notyet(
Expand Down Expand Up @@ -1346,25 +1347,11 @@ def string_temp_table(backend, con):
lambda t: t.string_col.lstrip(),
lambda t: t.str.lstrip(),
id="lstrip",
marks=[
pytest.mark.notyet(
["pyspark", "databricks"],
raises=AssertionError,
reason="Spark SQL LTRIM doesn't accept characters to trim",
),
],
),
param(
lambda t: t.string_col.rstrip(),
lambda t: t.str.rstrip(),
id="rstrip",
marks=[
pytest.mark.notyet(
["pyspark", "databricks"],
raises=AssertionError,
reason="Spark SQL RTRIM doesn't accept characters to trim",
),
],
),
param(
lambda t: t.string_col.strip(),
Expand Down Expand Up @@ -1423,6 +1410,7 @@ def test_string_methods_accents_and_emoji(
│ aBc │
│ 🐍 │
│ ÉéÈèêç │
│ fluf\f │
└────────────┘
"""
t = string_temp_table
Expand Down
Loading