From 9d398a1007b757ff150dbc85d5a8b43f07b284a2 Mon Sep 17 00:00:00 2001 From: Edward Cho <114528615+eycho-am@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:08:27 -0500 Subject: [PATCH] Fix bug in MinLength and MaxLength analyzers where given the NullBehavior.EmptyString option, the where filter wasn't properly applied (#538) --- .../amazon/deequ/analyzers/MaxLength.scala | 3 +- .../amazon/deequ/analyzers/MinLength.scala | 3 +- .../amazon/deequ/VerificationSuiteTest.scala | 41 +++++++++++++++++++ .../com/amazon/deequ/checks/CheckTest.scala | 15 +++++++ 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala b/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala index 19c9ca9b..3b55d4fa 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala @@ -75,7 +75,8 @@ case class MaxLength(column: String, where: Option[String] = None, analyzerOptio case NullBehavior.Fail => conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = Double.MaxValue) case NullBehavior.EmptyString => - length(conditionSelectionGivenColumn(col(column), Option(isNullCheck), replaceWith = "")).cast(DoubleType) + // A null is treated as an empty string, which has length 0 + conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = 0.0).cast(DoubleType) case _ => colLengths } diff --git a/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala b/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala index c155cca9..a6627d2d 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala @@ -75,7 +75,8 @@ case class MinLength(column: String, where: Option[String] = None, analyzerOptio case NullBehavior.Fail => conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = Double.MinValue) case NullBehavior.EmptyString => - 
length(conditionSelectionGivenColumn(col(column), Option(isNullCheck), replaceWith = "")).cast(DoubleType) + // A null is treated as an empty string, which has length 0 + conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = 0.0).cast(DoubleType) case _ => colLengths } diff --git a/src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala b/src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala index 8de81117..9da41562 100644 --- a/src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala +++ b/src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala @@ -600,6 +600,47 @@ class VerificationSuiteTest extends WordSpec with Matchers with SparkContextSpec assert(Seq(false, null, false, true, null, true).sameElements(rowLevel4)) } + "confirm that minLength and maxLength properly filters with nullBehavior empty" in withSparkSession { session => + val data = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(session) + + val minLength = new Check(CheckLevel.Error, "rule1") + .hasMinLength("item", _ > 3, + analyzerOptions = Option(AnalyzerOptions(NullBehavior.EmptyString, FilteredRowOutcome.NULL))) + .where("val1 > 3") + val maxLength = new Check(CheckLevel.Error, "rule2") + .hasMaxLength("item", _ <= 3, + analyzerOptions = Option(AnalyzerOptions(NullBehavior.EmptyString, FilteredRowOutcome.NULL))) + .where("val1 < 4") + + val expectedColumn1 = minLength.description + val expectedColumn2 = maxLength.description + + val suite = new VerificationSuite().onData(data) + .addCheck(minLength) + .addCheck(maxLength) + + val result: VerificationResult = suite.run() + + val resultData = VerificationResult.rowLevelResultsAsDataFrame(session, result, data) + + resultData.show(false) + + val expectedColumns: Set[String] = + data.columns.toSet + expectedColumn1 + expectedColumn2 + assert(resultData.columns.toSet == expectedColumns) + + // Unfiltered rows are all true - overall result should be Success + assert(result.status == CheckStatus.Success) + + // minLength > 3 would 
fail for the first three rows (length 1,2,3) + val rowLevel1 = resultData.select(expectedColumn1).collect().map(r => r.getAs[Any](0)) + assert(Seq(null, null, null, true, true, true).sameElements(rowLevel1)) + + // maxLength <= 3 would fail for the last three rows (length 4,5,6) + val rowLevel2 = resultData.select(expectedColumn2).collect().map(r => r.getAs[Any](0)) + assert(Seq(true, true, true, null, null, null).sameElements(rowLevel2)) + } + "generate a result that contains length row-level results with nullBehavior fail" in withSparkSession { session => val data = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(session) diff --git a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala index bc20b095..eab056f3 100644 --- a/src/test/scala/com/amazon/deequ/checks/CheckTest.scala +++ b/src/test/scala/com/amazon/deequ/checks/CheckTest.scala @@ -697,6 +697,21 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix assertSuccess(baseCheck.hasMaxLength("att1", _ == 4.0), context) } + "yield correct results for minimum and maximum length stats with where clause" in + withSparkSession { sparkSession => + val emptyNulLBehavior = Option(AnalyzerOptions(NullBehavior.EmptyString)) + val baseCheck = Check(CheckLevel.Error, description = "a description") + val df = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(sparkSession) + val context = AnalysisRunner.onData(df) + .addAnalyzers(Seq(MinLength("item", Option("val1 > 3"), emptyNulLBehavior), + MaxLength("item", Option("val1 <= 3"), emptyNulLBehavior))).run() + + assertSuccess(baseCheck.hasMinLength("item", _ >= 4.0, analyzerOptions = emptyNulLBehavior) + .where("val1 > 3"), context) // min length would be 1 without the where clause + assertSuccess(baseCheck.hasMaxLength("item", _ <= 3.0, analyzerOptions = emptyNulLBehavior) + .where("val1 <= 3"), context) // max length would be 6 without the where clause + } + "work on regular expression patterns for E-Mails" in 
withSparkSession { sparkSession => val col = "some" val df = dataFrameWithColumn(col, StringType, sparkSession, Row("someone@somewhere.org"),