Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in MinLength and MaxLength when NullBehavior.EmptyString #538

Merged
merged 1 commit into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ case class MaxLength(column: String, where: Option[String] = None, analyzerOptio
case NullBehavior.Fail =>
conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = Double.MaxValue)
case NullBehavior.EmptyString =>
length(conditionSelectionGivenColumn(col(column), Option(isNullCheck), replaceWith = "")).cast(DoubleType)
// Nulls are treated as empty strings, and an empty string has length 0
conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = 0.0).cast(DoubleType)
case _ =>
colLengths
}
Expand Down
3 changes: 2 additions & 1 deletion src/main/scala/com/amazon/deequ/analyzers/MinLength.scala
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ case class MinLength(column: String, where: Option[String] = None, analyzerOptio
case NullBehavior.Fail =>
conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = Double.MinValue)
case NullBehavior.EmptyString =>
length(conditionSelectionGivenColumn(col(column), Option(isNullCheck), replaceWith = "")).cast(DoubleType)
// Nulls are treated as empty strings, and an empty string has length 0
conditionSelectionGivenColumn(colLengths, Option(isNullCheck), replaceWith = 0.0).cast(DoubleType)
case _ =>
colLengths
}
Expand Down
41 changes: 41 additions & 0 deletions src/test/scala/com/amazon/deequ/VerificationSuiteTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,47 @@ class VerificationSuiteTest extends WordSpec with Matchers with SparkContextSpec
assert(Seq(false, null, false, true, null, true).sameElements(rowLevel4))
}

"confirm that minLength and maxLength properly filters with nullBehavior empty" in withSparkSession { session =>
  val df = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(session)

  // Both checks use the same options: NullBehavior.EmptyString combined with FilteredRowOutcome.NULL.
  val emptyStringOptions = Option(AnalyzerOptions(NullBehavior.EmptyString, FilteredRowOutcome.NULL))

  val minLengthCheck = new Check(CheckLevel.Error, "rule1")
    .hasMinLength("item", _ > 3, analyzerOptions = emptyStringOptions)
    .where("val1 > 3")
  val maxLengthCheck = new Check(CheckLevel.Error, "rule2")
    .hasMaxLength("item", _ <= 3, analyzerOptions = emptyStringOptions)
    .where("val1 < 4")

  // Row-level result columns are looked up by the description of the check that produced them.
  val minLengthColumn = minLengthCheck.description
  val maxLengthColumn = maxLengthCheck.description

  val result: VerificationResult = new VerificationSuite()
    .onData(df)
    .addCheck(minLengthCheck)
    .addCheck(maxLengthCheck)
    .run()

  val resultData = VerificationResult.rowLevelResultsAsDataFrame(session, result, df)

  // Reads the first (only) field of every row of the given result column.
  def rowLevelValues(columnName: String): Array[Any] =
    resultData.select(columnName).collect().map(row => row.getAs[Any](0))

  resultData.show(false)

  // The result frame holds the input columns plus one column per check.
  val expectedColumns: Set[String] = df.columns.toSet ++ Set(minLengthColumn, maxLengthColumn)
  assert(resultData.columns.toSet == expectedColumns)

  // Every row that passes its where filter satisfies its constraint, so the run succeeds overall.
  assert(result.status == CheckStatus.Success)

  // The first three rows (lengths 1, 2, 3) would violate minLength > 3; they are expected as null here.
  val expectedMinLengthRows = Seq(null, null, null, true, true, true)
  assert(expectedMinLengthRows.sameElements(rowLevelValues(minLengthColumn)))

  // The last three rows (lengths 4, 5, 6) would violate maxLength <= 3; they are expected as null here.
  val expectedMaxLengthRows = Seq(true, true, true, null, null, null)
  assert(expectedMaxLengthRows.sameElements(rowLevelValues(maxLengthColumn)))
}

"generate a result that contains length row-level results with nullBehavior fail" in withSparkSession { session =>
val data = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(session)

Expand Down
15 changes: 15 additions & 0 deletions src/test/scala/com/amazon/deequ/checks/CheckTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,21 @@ class CheckTest extends AnyWordSpec with Matchers with SparkContextSpec with Fix
assertSuccess(baseCheck.hasMaxLength("att1", _ == 4.0), context)
}

"yield correct results for minimum and maximum length stats with where clause" in
  withSparkSession { sparkSession =>
    // One options value is shared by the analyzers and the checks.
    val emptyNullBehavior = Option(AnalyzerOptions(NullBehavior.EmptyString))
    val data = getDfCompleteAndInCompleteColumnsAndVarLengthStrings(sparkSession)

    // Each analyzer carries the same where clause as the check asserted against it below.
    val lengthAnalyzers = Seq(
      MinLength("item", Option("val1 > 3"), emptyNullBehavior),
      MaxLength("item", Option("val1 <= 3"), emptyNullBehavior))
    val context = AnalysisRunner.onData(data).addAnalyzers(lengthAnalyzers).run()

    val baseCheck = Check(CheckLevel.Error, description = "a description")

    // Without the where clause the minimum length would be 1.
    val filteredMinLength = baseCheck
      .hasMinLength("item", _ >= 4.0, analyzerOptions = emptyNullBehavior)
      .where("val1 > 3")
    assertSuccess(filteredMinLength, context)

    // Without the where clause the maximum length would be 6.
    val filteredMaxLength = baseCheck
      .hasMaxLength("item", _ <= 3.0, analyzerOptions = emptyNullBehavior)
      .where("val1 <= 3")
    assertSuccess(filteredMaxLength, context)
  }

"work on regular expression patterns for E-Mails" in withSparkSession { sparkSession =>
val col = "some"
val df = dataFrameWithColumn(col, StringType, sparkSession, Row("someone@somewhere.org"),
Expand Down
Loading