Commit

Add Unit Tests for Pattern Match

jasonhorner committed Sep 8, 2024
1 parent 9a03131 commit 18b98c5
Showing 2 changed files with 203 additions and 75 deletions.
257 changes: 183 additions & 74 deletions src/test/scala/com/amazon/deequ/analyzers/PatternMatchTest.scala
@@ -1,18 +1,16 @@
/**
* Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
/** Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.deequ.analyzers

@@ -22,82 +20,193 @@ import com.amazon.deequ.utils.FixtureSupport
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class PatternMatchTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport {
class PatternMatchTest
extends AnyWordSpec
with Matchers
with SparkContextSpec
with FixtureSupport {

"PatternMatch" should {
"return row-level results for non-null columns" in withSparkSession { session =>

val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 1", """\d""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, true, true, true, true)
"return row-level results for non-null columns" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 1", """\d""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, true, true, true, true)
}

"return row-level results for non-null columns starts with digit" in withSparkSession { session =>

val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 1", """(^[0-4])""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(false, false, true, true, false, false, true, true)
"return row-level results for non-null columns starts with digit" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry =
PatternMatch("Address Line 1", """(^[0-4])""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(false, false, true, true, false, false, true, true)
}

"return row-level results for non-null columns starts with digit filtered as true" in withSparkSession { session =>

val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 1", """(^[0-4])""".r, where = Option("id < 5"),
analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)))
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(false, false, true, true, false, true, true, true)
"return row-level results for non-null columns starts with digit filtered as true" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch(
"Address Line 1",
"""(^[0-4])""".r,
where = Option("id < 5"),
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(false, false, true, true, false, true, true, true)
}

"return row-level results for columns with nulls" in withSparkSession { session =>
"return row-level results for columns with nulls" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val data = getDfWithStringColumns(session)
val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, true, true, false)
data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, true, true, false)
}

"return row-level results for columns with nulls filtered as true" in withSparkSession { session =>

val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r, where = Option("id < 5"),
analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)))
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, true, true, true)
"return row-level results for columns with nulls filtered as true" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch(
"Address Line 2",
"""\w""".r,
where = Option("id < 5"),
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, true, true, true)
}

"return row-level results for columns with nulls filtered as null" in withSparkSession { session =>
"return row-level results for columns with nulls filtered as null" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch(
"Address Line 2",
"""\w""".r,
where = Option("id < 5"),
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.NULL))
)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, null, null, null)
}

val data = getDfWithStringColumns(session)
"correctly identify valid and invalid US SSNs" in withSparkSession {
session =>
val data = getDfWithPatternMatch(session)

val patternMatchSSN = PatternMatch(
"SSN",
SSN_US,
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchSSN.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchSSN.computeMetricFrom(state)

data
.withColumn("SSN_Match", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("SSN_Match")) shouldBe
Seq(true, true, false, false, false, false, false, true, true, false,
true, false)
}

val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r, where = Option("id < 5"),
analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.NULL)))
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)
"correctly identify valid and invalid US phone numbers" in withSparkSession {
session =>
val data = getDfWithPatternMatch(session)

val patternMatchPhone = PatternMatch(
"PhoneNumber",
PHONE_NUMBER_US,
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchPhone.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchPhone.computeMetricFrom(state)

data
.withColumn("Phone_Match", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("Phone_Match")) shouldBe
Seq(true, true, true, true, true, true, true, true, true, true, false,
false)
}

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, null, null, null)
"correctly identify valid and invalid US postal codes" in withSparkSession {
session =>
val data = getDfWithPatternMatch(session)

val patternMatchPostalCode = PatternMatch(
"PostalCode",
POSTAL_CODE_US,
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchPostalCode.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchPostalCode.computeMetricFrom(state)

data
.withColumn("PostalCode_Match", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("PostalCode_Match")) shouldBe
Seq(true, true, true, true, true, false, true, false, false, true,
true, false)
}

}
}
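
For context, the analyzers exercised above are normally driven through deequ's AnalysisRunner rather than by calling computeStateFrom and computeMetricFrom directly. A minimal sketch, assuming the standard runner API; data and session stand in for any DataFrame and SparkSession:

import com.amazon.deequ.analyzers.PatternMatch
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import com.amazon.deequ.analyzers.runners.AnalyzerContext.successMetricsAsDataFrame

// Compute the fraction of rows in "Address Line 1" that contain a digit,
// mirroring the regex used in the first test above.
val result: AnalyzerContext = AnalysisRunner
  .onData(data)
  .addAnalyzer(PatternMatch("Address Line 1", """\d""".r))
  .run()

// Metrics come back as a DataFrame of (entity, instance, name, value) rows.
successMetricsAsDataFrame(session, result).show()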
21 changes: 20 additions & 1 deletion src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala
@@ -466,7 +466,26 @@ trait FixtureSupport {
("bar", 20)
).toDF("name", "age")
}


def getDfWithPatternMatch(sparkSession: SparkSession): DataFrame = {
import sparkSession.implicits._

Seq(
// SSN PostalCode PhoneNumber
("123-45-6789", "12345", "+1 234-567-8901"), // All valid
("078-05-1120", "12345-6789", "(234) 567-8901 x1234"), // All valid with extension
("000-12-3456", "12345", "234-567-8901"), // Invalid SSN (000 area), valid others
("666-45-6789", "54321", "1234567890"), // Invalid SSN (666 area), valid phone
("900-12-3456", "54321-1234", "+1 (234) 567 8901"), // Invalid SSN (900 range), valid postal and phone
("123-45-0000", "ABCDE", "+1 234 567-8901"), // Invalid SSN (0000 serial), invalid postal code
("123-00-6789", "54321", "123-45-6789"), // Invalid SSN (00 group), phone looks like SSN
("123-45-6789", "123", "234-567-8901"), // Valid SSN, invalid postal (too short)
("123-45-6789", "123456789", "+1 234 567 8901 ext1234"),// Valid SSN, valid phone with extension, invalid postal (no dash)
("999-99-9999", "54321-6789", "234 567 8901"), // Invalid SSN (invalid area code 999), valid postal, valid phone
("123-45-6789", "54321", "1234"), // Valid SSN, valid postal, invalid phone (too short)
("", "", "") // Empty row, testing blanks
).toDF("SSN", "PostalCode", "PhoneNumber")
}
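
The SSN_US, PHONE_NUMBER_US, and POSTAL_CODE_US constants referenced by the new tests are defined elsewhere in this change and do not appear in this diff. Purely as a hypothetical illustration (not the committed definitions), regexes consistent with the expected results above could look like this; the phone pattern is omitted because the accepted formats are broader:

import scala.util.matching.Regex

// Hypothetical: reject area 000, 666, or 900-999, group 00, and serial 0000.
val SSN_US: Regex = """^(?!000|666|9\d\d)\d{3}-(?!00)\d{2}-(?!0000)\d{4}$""".r
// Hypothetical: five digits, optionally followed by a dash and four more (ZIP+4).
val POSTAL_CODE_US: Regex = """^\d{5}(-\d{4})?$""".r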
def getFakeNumericColumnProfileWithMinMaxMeanAndStdDev(
columnName: String,
completeness: Double,
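
End to end, the new fixture would typically be consumed through a VerificationSuite check rather than a raw analyzer. A hedged sketch, assuming the SSN_US pattern used by the tests is in scope; the 0.8 threshold is illustrative:

import com.amazon.deequ.{VerificationResult, VerificationSuite}
import com.amazon.deequ.checks.{Check, CheckLevel}

// Fail the check unless at least 80% of SSN values match the pattern.
val verification: VerificationResult = VerificationSuite()
  .onData(getDfWithPatternMatch(session))
  .addCheck(
    Check(CheckLevel.Error, "ssn format")
      .hasPattern("SSN", SSN_US, _ >= 0.8)
  )
  .run()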
