Commit

Add Unit Tests for Pattern Match

jasonhorner committed Sep 8, 2024
1 parent 9a03131 commit 18b98c5
Showing 2 changed files with 203 additions and 75 deletions.
257 changes: 183 additions & 74 deletions src/test/scala/com/amazon/deequ/analyzers/PatternMatchTest.scala
@@ -1,18 +1,16 @@
/**
* Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/
/** Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.deequ.analyzers

@@ -22,82 +20,193 @@ import com.amazon.deequ.utils.FixtureSupport
import org.scalatest.matchers.should.Matchers
import org.scalatest.wordspec.AnyWordSpec

class PatternMatchTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport {
class PatternMatchTest
extends AnyWordSpec
with Matchers
with SparkContextSpec
with FixtureSupport {

"PatternMatch" should {
"return row-level results for non-null columns" in withSparkSession { session =>

val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 1", """\d""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, true, true, true, true)
"return row-level results for non-null columns" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 1", """\d""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, true, true, true, true)
}

"return row-level results for non-null columns starts with digit" in withSparkSession { session =>

val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 1", """(^[0-4])""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(false, false, true, true, false, false, true, true)
"return row-level results for non-null columns starts with digit" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry =
PatternMatch("Address Line 1", """(^[0-4])""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(false, false, true, true, false, false, true, true)
}

"return row-level results for non-null columns starts with digit filtered as true" in withSparkSession { session =>

val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 1", """(^[0-4])""".r, where = Option("id < 5"),
analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)))
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(false, false, true, true, false, true, true, true)
"return row-level results for non-null columns starts with digit filtered as true" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch(
"Address Line 1",
"""(^[0-4])""".r,
where = Option("id < 5"),
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(false, false, true, true, false, true, true, true)
}

"return row-level results for columns with nulls" in withSparkSession { session =>
"return row-level results for columns with nulls" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val data = getDfWithStringColumns(session)
val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, true, true, false)
data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, true, true, false)
}

"return row-level results for columns with nulls filtered as true" in withSparkSession { session =>

val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r, where = Option("id < 5"),
analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)))
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, true, true, true)
"return row-level results for columns with nulls filtered as true" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch(
"Address Line 2",
"""\w""".r,
where = Option("id < 5"),
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, true, true, true)
}

"return row-level results for columns with nulls filtered as null" in withSparkSession { session =>
"return row-level results for columns with nulls filtered as null" in withSparkSession {
session =>
val data = getDfWithStringColumns(session)

val patternMatchCountry = PatternMatch(
"Address Line 2",
"""\w""".r,
where = Option("id < 5"),
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.NULL))
)
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchCountry.computeMetricFrom(state)

data
.withColumn("new", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, null, null, null)
}

val data = getDfWithStringColumns(session)
"correctly identify valid and invalid US SSNs" in withSparkSession {
session =>
val data = getDfWithPatternMatch(session)

val patternMatchSSN = PatternMatch(
"SSN",
SSN_US,
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchSSN.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchSSN.computeMetricFrom(state)

data
.withColumn("SSN_Match", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("SSN_Match")) shouldBe
Seq(true, true, false, false, false, false, false, true, true, false,
true, false)
}

val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r, where = Option("id < 5"),
analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.NULL)))
val state = patternMatchCountry.computeStateFrom(data)
val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state)
"correctly identify valid and invalid US phone numbers" in withSparkSession {
session =>
val data = getDfWithPatternMatch(session)

val patternMatchPhone = PatternMatch(
"PhoneNumber",
PHONE_NUMBER_US,
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchPhone.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchPhone.computeMetricFrom(state)

data
.withColumn("Phone_Match", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("Phone_Match")) shouldBe
Seq(true, true, true, true, true, true, true, true, true, true, false,
false)
}

data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe
Seq(true, true, true, true, false, null, null, null)
"correctly identify valid and invalid US postal codes" in withSparkSession {
session =>
val data = getDfWithPatternMatch(session)

val patternMatchPostalCode = PatternMatch(
"PostalCode",
POSTAL_CODE_US,
analyzerOptions =
Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))
)
val state = patternMatchPostalCode.computeStateFrom(data)
val metric: DoubleMetric with FullColumn =
patternMatchPostalCode.computeMetricFrom(state)

data
.withColumn("PostalCode_Match", metric.fullColumn.get)
.collect()
.map(_.getAs[Any]("PostalCode_Match")) shouldBe
Seq(true, true, true, true, true, false, true, false, false, true,
true, false)
}

}
}
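
For context, the analyzers exercised above are normally driven through deequ's AnalysisRunner rather than by calling computeStateFrom and computeMetricFrom directly. A minimal sketch, assuming the standard runner API; data and session stand in for any DataFrame and SparkSession:

import com.amazon.deequ.analyzers.PatternMatch
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import com.amazon.deequ.analyzers.runners.AnalyzerContext.successMetricsAsDataFrame

// Compute the fraction of rows in "Address Line 1" that contain a digit,
// mirroring the regex used in the first test above.
val result: AnalyzerContext = AnalysisRunner
  .onData(data)
  .addAnalyzer(PatternMatch("Address Line 1", """\d""".r))
  .run()

// Metrics come back as a DataFrame of (entity, instance, name, value) rows.
successMetricsAsDataFrame(session, result).show()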
21 changes: 20 additions & 1 deletion src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala
@@ -466,7 +466,26 @@ trait FixtureSupport {
("bar", 20)
).toDF("name", "age")
}


def getDfWithPatternMatch(sparkSession: SparkSession): DataFrame = {
import sparkSession.implicits._

Seq(
// SSN PostalCode PhoneNumber
("123-45-6789", "12345", "+1 234-567-8901"), // All valid
("078-05-1120", "12345-6789", "(234) 567-8901 x1234"), // All valid with extension
("000-12-3456", "12345", "234-567-8901"), // Invalid SSN (000 area), valid others
("666-45-6789", "54321", "1234567890"), // Invalid SSN (666 area), valid phone
("900-12-3456", "54321-1234", "+1 (234) 567 8901"), // Invalid SSN (900 range), valid postal and phone
("123-45-0000", "ABCDE", "+1 234 567-8901"), // Invalid SSN (0000 serial), invalid postal code
("123-00-6789", "54321", "123-45-6789"), // Invalid SSN (00 group), phone looks like SSN
("123-45-6789", "123", "234-567-8901"), // Valid SSN, invalid postal (too short)
("123-45-6789", "123456789", "+1 234 567 8901 ext1234"),// Valid SSN, valid phone with extension, invalid postal (no dash)
("999-99-9999", "54321-6789", "234 567 8901"), // Invalid SSN (invalid area code 999), valid postal, valid phone
("123-45-6789", "54321", "1234"), // Valid SSN, valid postal, invalid phone (too short)
("", "", "") // Empty row, testing blanks
).toDF("SSN", "PostalCode", "PhoneNumber")
}
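
The SSN_US, PHONE_NUMBER_US, and POSTAL_CODE_US constants referenced by the new tests are defined elsewhere in this change and do not appear in this diff. Purely as a hypothetical illustration (not the committed definitions), regexes consistent with the expected results above could look like this; the phone pattern is omitted because the accepted formats are broader:

import scala.util.matching.Regex

// Hypothetical: reject area 000, 666, or 900-999, group 00, and serial 0000.
val SSN_US: Regex = """^(?!000|666|9\d\d)\d{3}-(?!00)\d{2}-(?!0000)\d{4}$""".r
// Hypothetical: five digits, optionally followed by a dash and four more (ZIP+4).
val POSTAL_CODE_US: Regex = """^\d{5}(-\d{4})?$""".r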
def getFakeNumericColumnProfileWithMinMaxMeanAndStdDev(
columnName: String,
completeness: Double,
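
End to end, the new fixture would typically be consumed through a VerificationSuite check rather than a raw analyzer. A hedged sketch, assuming the SSN_US pattern used by the tests is in scope; the 0.8 threshold is illustrative:

import com.amazon.deequ.{VerificationResult, VerificationSuite}
import com.amazon.deequ.checks.{Check, CheckLevel}

// Fail the check unless at least 80% of SSN values match the pattern.
val verification: VerificationResult = VerificationSuite()
  .onData(getDfWithPatternMatch(session))
  .addCheck(
    Check(CheckLevel.Error, "ssn format")
      .hasPattern("SSN", SSN_US, _ >= 0.8)
  )
  .run()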
