From 7bb70863a240101e58fa8f1141f2c0ffd59a53e9 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Sun, 10 Mar 2024 15:31:58 -0400 Subject: [PATCH] Addressed feedback from PR review - Added comments to the "aggregationFunctions" method in Min, Max, MinLength and MaxLength analyzers. - Refactored the criterion method to reuse an existing variable. --- src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala | 5 ++++- src/main/scala/com/amazon/deequ/analyzers/Maximum.scala | 3 +++ src/main/scala/com/amazon/deequ/analyzers/MinLength.scala | 5 ++++- src/main/scala/com/amazon/deequ/analyzers/Minimum.scala | 3 +++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala b/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala index 69a39eb1..141d92fb 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/MaxLength.scala @@ -36,6 +36,9 @@ case class MaxLength(column: String, where: Option[String] = None, analyzerOptio with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { + // The criterion returns a column where each row contains an array of 2 elements. + // The first element of the array is a string that indicates if the row is "in scope" or "filtered" out. + // The second element is the value used for calculating the metric. We use "element_at" to extract it. max(element_at(criterion, 2).cast(DoubleType)) :: Nil } @@ -58,7 +61,7 @@ case class MaxLength(column: String, where: Option[String] = None, analyzerOptio case NullBehavior.Fail => when(isNullCheck, Double.MaxValue).otherwise(colLength) // Empty String is 0 length string case NullBehavior.EmptyString => when(isNullCheck, lit(0.0)).otherwise(colLength) - case NullBehavior.Ignore => length(col(column)) + case NullBehavior.Ignore => colLength } conditionalSelectionWithAugmentedOutcome(updatedColumn, where) diff --git a/src/main/scala/com/amazon/deequ/analyzers/Maximum.scala b/src/main/scala/com/amazon/deequ/analyzers/Maximum.scala index 90143f5e..1e52a7ae 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/Maximum.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/Maximum.scala @@ -46,6 +46,9 @@ case class Maximum(column: String, where: Option[String] = None, analyzerOptions with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { + // The criterion returns a column where each row contains an array of 2 elements. + // The first element of the array is a string that indicates if the row is "in scope" or "filtered" out. + // The second element is the value used for calculating the metric. We use "element_at" to extract it. max(element_at(criterion, 2).cast(DoubleType)) :: Nil } diff --git a/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala b/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala index dd982b4e..ddc4497b 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/MinLength.scala @@ -36,6 +36,9 @@ case class MinLength(column: String, where: Option[String] = None, analyzerOptio with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { + // The criterion returns a column where each row contains an array of 2 elements. + // The first element of the array is a string that indicates if the row is "in scope" or "filtered" out. + // The second element is the value used for calculating the metric. We use "element_at" to extract it. min(element_at(criterion, 2).cast(DoubleType)) :: Nil } @@ -58,7 +61,7 @@ case class MinLength(column: String, where: Option[String] = None, analyzerOptio case NullBehavior.Fail => when(isNullCheck, Double.MinValue).otherwise(colLength) // Empty String is 0 length string case NullBehavior.EmptyString => when(isNullCheck, lit(0.0)).otherwise(colLength) - case NullBehavior.Ignore => length(col(column)) + case NullBehavior.Ignore => colLength } conditionalSelectionWithAugmentedOutcome(updatedColumn, where) diff --git a/src/main/scala/com/amazon/deequ/analyzers/Minimum.scala b/src/main/scala/com/amazon/deequ/analyzers/Minimum.scala index ffc78d27..701ae0f0 100644 --- a/src/main/scala/com/amazon/deequ/analyzers/Minimum.scala +++ b/src/main/scala/com/amazon/deequ/analyzers/Minimum.scala @@ -46,6 +46,9 @@ case class Minimum(column: String, where: Option[String] = None, analyzerOptions with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { + // The criterion returns a column where each row contains an array of 2 elements. + // The first element of the array is a string that indicates if the row is "in scope" or "filtered" out. + // The second element is the value used for calculating the metric. We use "element_at" to extract it. min(element_at(criterion, 2).cast(DoubleType)) :: Nil }