diff --git a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lstrip/out.sql b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lstrip/out.sql index 361d6f5fb2e9..411a49821957 100644 --- a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lstrip/out.sql +++ b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/lstrip/out.sql @@ -1,3 +1,3 @@ SELECT - LTRIM(`t0`.`string_col`, ' \t\n\r\v\f') AS `LStrip(string_col)` + REGEXP_REPLACE(`t0`.`string_col`, '^\\s+', '') AS `LStrip(string_col)` FROM `functional_alltypes` AS `t0` \ No newline at end of file diff --git a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rstrip/out.sql b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rstrip/out.sql index 4506448e7e27..93ddcb55c9ae 100644 --- a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rstrip/out.sql +++ b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/rstrip/out.sql @@ -1,3 +1,3 @@ SELECT - RTRIM(`t0`.`string_col`, ' \t\n\r\v\f') AS `RStrip(string_col)` + REGEXP_REPLACE(`t0`.`string_col`, '\\s+$', '') AS `RStrip(string_col)` FROM `functional_alltypes` AS `t0` \ No newline at end of file diff --git a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/strip/out.sql b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/strip/out.sql index bb26534ade86..64c67f6a524e 100644 --- a/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/strip/out.sql +++ b/ibis/backends/impala/tests/snapshots/test_string_builtins/test_string_builtins/strip/out.sql @@ -1,3 +1,3 @@ SELECT - RTRIM(LTRIM(`t0`.`string_col`, ' \t\n\r\v\f'), ' \t\n\r\v\f') AS `Strip(string_col)` + REGEXP_REPLACE(`t0`.`string_col`, '^\\s+|\\s+$', '') AS `Strip(string_col)` FROM `functional_alltypes` AS `t0` \ 
No newline at end of file diff --git a/ibis/backends/sql/compilers/impala.py b/ibis/backends/sql/compilers/impala.py index 244daca67c89..f0381c9fac20 100644 --- a/ibis/backends/sql/compilers/impala.py +++ b/ibis/backends/sql/compilers/impala.py @@ -1,7 +1,5 @@ from __future__ import annotations -from string import whitespace as WHITESPACE - import sqlglot as sg import sqlglot.expressions as sge @@ -45,6 +43,7 @@ class ImpalaCompiler(SQLGlotCompiler): ops.ArgMin, ops.Covariance, ops.ExtractDayOfYear, + ops.Kurtosis, ops.Levenshtein, ops.Map, ops.Median, @@ -59,7 +58,6 @@ class ImpalaCompiler(SQLGlotCompiler): ops.TimestampBucket, ops.TimestampDelta, ops.Unnest, - ops.Kurtosis, ) SIMPLE_OPS = { @@ -78,8 +76,8 @@ class ImpalaCompiler(SQLGlotCompiler): ops.ExtractEpochSeconds: "unix_timestamp", ops.Hash: "fnv_hash", ops.Ln: "ln", - ops.TypeOf: "typeof", ops.RegexReplace: "regexp_replace", + ops.TypeOf: "typeof", } @staticmethod @@ -325,16 +323,13 @@ def visit_DateDelta(self, op, *, left, right, part): return self.f.datediff(left, right) def visit_LStrip(self, op, *, arg): - return self.f.anon.ltrim(arg, WHITESPACE) + return self.f.regexp_replace(arg, r"^\s+", "") def visit_RStrip(self, op, *, arg): - return self.f.anon.rtrim(arg, WHITESPACE) + return self.f.regexp_replace(arg, r"\s+$", "") def visit_Strip(self, op, *, arg): - # Impala's `TRIM` doesn't allow specifying characters to trim off, unlike - # Impala's `RTRIM` and `LTRIM` which accept a set of characters to - # remove. 
- return self.f.anon.rtrim(self.f.anon.ltrim(arg, WHITESPACE), WHITESPACE) + return self.f.regexp_replace(arg, r"^\s+|\s+$", "") compiler = ImpalaCompiler() diff --git a/ibis/backends/sql/compilers/pyspark.py b/ibis/backends/sql/compilers/pyspark.py index c4a4d741d659..b85267be2b8b 100644 --- a/ibis/backends/sql/compilers/pyspark.py +++ b/ibis/backends/sql/compilers/pyspark.py @@ -85,8 +85,6 @@ class PySparkCompiler(SQLGlotCompiler): ops.EndsWith: "endswith", ops.Hash: "hash", ops.Log10: "log10", - ops.LStrip: "ltrim", - ops.RStrip: "rtrim", ops.MapLength: "size", ops.MapContains: "map_contains_key", ops.MapMerge: "map_concat", @@ -684,5 +682,14 @@ def visit_ArraySum(self, op, *, arg): def visit_ArrayMean(self, op, *, arg): return self._array_reduction(dtype=op.dtype, arg=arg, output=operator.truediv) + def visit_LStrip(self, op, *, arg): + return self.f.regexp_replace(arg, r"^\s+", "") + + def visit_RStrip(self, op, *, arg): + return self.f.regexp_replace(arg, r"\s+$", "") + + def visit_Strip(self, op, *, arg): + return self.f.regexp_replace(arg, r"^\s+|\s+$", "") + compiler = PySparkCompiler() diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py index da33489ba703..1346b404316f 100644 --- a/ibis/backends/tests/test_string.py +++ b/ibis/backends/tests/test_string.py @@ -1144,8 +1144,9 @@ def string_temp_table(backend, con): "aBc", "🐍", "ÉéÈèêç", + "fluf\f", ], - "index_col": [0, 1, 2, 3, 4, 5, 6], + "index_col": [0, 1, 2, 3, 4, 5, 6, 7], } ) @@ -1277,7 +1278,7 @@ def string_temp_table(backend, con): ), param( lambda t: t.string_col.find_in_set(["aBc", "123"]), - lambda _: pd.Series([-1, -1, -1, 1, 0, -1, -1], name="tmp"), + lambda _: pd.Series([-1, -1, -1, 1, 0, -1, -1, -1], name="tmp"), id="find_in_set", marks=[ pytest.mark.notyet( @@ -1306,7 +1307,7 @@ def string_temp_table(backend, con): ), param( lambda t: t.string_col.find_in_set(["abc, 123"]), - lambda _: pd.Series([-1, -1, -1, -1, -1, -1, -1], name="tmp"), + lambda 
_: pd.Series([-1, -1, -1, -1, -1, -1, -1, -1], name="tmp"), id="find_in_set_w_comma", marks=[ pytest.mark.notyet( @@ -1346,25 +1347,11 @@ def string_temp_table(backend, con): lambda t: t.string_col.lstrip(), lambda t: t.str.lstrip(), id="lstrip", - marks=[ - pytest.mark.notyet( - ["pyspark", "databricks"], - raises=AssertionError, - reason="Spark SQL LTRIM doesn't accept characters to trim", - ), - ], ), param( lambda t: t.string_col.rstrip(), lambda t: t.str.rstrip(), id="rstrip", - marks=[ - pytest.mark.notyet( - ["pyspark", "databricks"], - raises=AssertionError, - reason="Spark SQL RTRIM doesn't accept characters to trim", - ), - ], ), param( lambda t: t.string_col.strip(), @@ -1423,6 +1410,7 @@ def test_string_methods_accents_and_emoji( │ aBc │ │ 🐍 │ │ ÉéÈèêç │ + │ fluf\f │ └────────────┘ """ t = string_temp_table