From 18b98c56125442bc2f5a167e46db085dbf4c2741 Mon Sep 17 00:00:00 2001 From: jhorner Date: Sat, 7 Sep 2024 19:45:03 -0600 Subject: [PATCH] Add Unit Tests for Pattern Match --- .../deequ/analyzers/PatternMatchTest.scala | 257 +++++++++++++----- .../amazon/deequ/utils/FixtureSupport.scala | 21 +- 2 files changed, 203 insertions(+), 75 deletions(-) diff --git a/src/test/scala/com/amazon/deequ/analyzers/PatternMatchTest.scala b/src/test/scala/com/amazon/deequ/analyzers/PatternMatchTest.scala index 94d43967..dd00a723 100644 --- a/src/test/scala/com/amazon/deequ/analyzers/PatternMatchTest.scala +++ b/src/test/scala/com/amazon/deequ/analyzers/PatternMatchTest.scala @@ -1,18 +1,16 @@ -/** - * Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). You may not - * use this file except in compliance with the License. A copy of the License - * is located at - * - * http://aws.amazon.com/apache2.0/ - * - * or in the "license" file accompanying this file. This file is distributed on - * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either - * express or implied. See the License for the specific language governing - * permissions and limitations under the License. - * - */ +/** Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may not + * use this file except in compliance with the License. A copy of the License + * is located at + * + * http://aws.amazon.com/apache2.0/ + * + * or in the "license" file accompanying this file. This file is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ package com.amazon.deequ.analyzers @@ -22,82 +20,193 @@ import com.amazon.deequ.utils.FixtureSupport import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec -class PatternMatchTest extends AnyWordSpec with Matchers with SparkContextSpec with FixtureSupport { +class PatternMatchTest + extends AnyWordSpec + with Matchers + with SparkContextSpec + with FixtureSupport { "PatternMatch" should { - "return row-level results for non-null columns" in withSparkSession { session => - - val data = getDfWithStringColumns(session) - - val patternMatchCountry = PatternMatch("Address Line 1", """\d""".r) - val state = patternMatchCountry.computeStateFrom(data) - val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state) - - data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe - Seq(true, true, true, true, true, true, true, true) + "return row-level results for non-null columns" in withSparkSession { + session => + val data = getDfWithStringColumns(session) + + val patternMatchCountry = PatternMatch("Address Line 1", """\d""".r) + val state = patternMatchCountry.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchCountry.computeMetricFrom(state) + + data + .withColumn("new", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("new")) shouldBe + Seq(true, true, true, true, true, true, true, true) } - "return row-level results for non-null columns starts with digit" in withSparkSession { session => - - val data = getDfWithStringColumns(session) - - val patternMatchCountry = PatternMatch("Address Line 1", """(^[0-4])""".r) - val state = patternMatchCountry.computeStateFrom(data) - val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state) - - data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe - Seq(false, false, true, true, false, false, true, true) + "return row-level results for non-null columns starts with digit" in withSparkSession { + session => + val data = getDfWithStringColumns(session) + + val patternMatchCountry = + PatternMatch("Address Line 1", """(^[0-4])""".r) + val state = patternMatchCountry.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchCountry.computeMetricFrom(state) + + data + .withColumn("new", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("new")) shouldBe + Seq(false, false, true, true, false, false, true, true) } - "return row-level results for non-null columns starts with digit filtered as true" in withSparkSession { session => - - val data = getDfWithStringColumns(session) - - val patternMatchCountry = PatternMatch("Address Line 1", """(^[0-4])""".r, where = Option("id < 5"), - analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))) - val state = patternMatchCountry.computeStateFrom(data) - val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state) - - data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe - Seq(false, false, true, true, false, true, true, true) + "return row-level results for non-null columns starts with digit filtered as true" in withSparkSession { + session => + val data = getDfWithStringColumns(session) + + val patternMatchCountry = PatternMatch( + "Address Line 1", + """(^[0-4])""".r, + where = Option("id < 5"), + analyzerOptions = + Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)) + ) + val state = patternMatchCountry.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchCountry.computeMetricFrom(state) + + data + .withColumn("new", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("new")) shouldBe + Seq(false, false, true, true, false, true, true, true) } - "return row-level results for columns with nulls" in withSparkSession { session => + "return row-level results for columns with nulls" in withSparkSession { + session => + val data = getDfWithStringColumns(session) - val data = getDfWithStringColumns(session) + val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r) + val state = patternMatchCountry.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchCountry.computeMetricFrom(state) - val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r) - val state = patternMatchCountry.computeStateFrom(data) - val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state) - - data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe - Seq(true, true, true, true, false, true, true, false) + data + .withColumn("new", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("new")) shouldBe + Seq(true, true, true, true, false, true, true, false) } - "return row-level results for columns with nulls filtered as true" in withSparkSession { session => - - val data = getDfWithStringColumns(session) - - val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r, where = Option("id < 5"), - analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE))) - val state = patternMatchCountry.computeStateFrom(data) - val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state) - - data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe - Seq(true, true, true, true, false, true, true, true) + "return row-level results for columns with nulls filtered as true" in withSparkSession { + session => + val data = getDfWithStringColumns(session) + + val patternMatchCountry = PatternMatch( + "Address Line 2", + """\w""".r, + where = Option("id < 5"), + analyzerOptions = + Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)) + ) + val state = patternMatchCountry.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchCountry.computeMetricFrom(state) + + data + .withColumn("new", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("new")) shouldBe + Seq(true, true, true, true, false, true, true, true) } - "return row-level results for columns with nulls filtered as null" in withSparkSession { session => + "return row-level results for columns with nulls filtered as null" in withSparkSession { + session => + val data = getDfWithStringColumns(session) + + val patternMatchCountry = PatternMatch( + "Address Line 2", + """\w""".r, + where = Option("id < 5"), + analyzerOptions = + Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.NULL)) + ) + val state = patternMatchCountry.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchCountry.computeMetricFrom(state) + + data + .withColumn("new", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("new")) shouldBe + Seq(true, true, true, true, false, null, null, null) + } - val data = getDfWithStringColumns(session) + "correctly identify valid and invalid US SSNs" in withSparkSession { + session => + val data = getDfWithPatternMatch(session) + + val patternMatchSSN = PatternMatch( + "SSN", + SSN_US, + analyzerOptions = + Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)) + ) + val state = patternMatchSSN.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchSSN.computeMetricFrom(state) + + data + .withColumn("SSN_Match", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("SSN_Match")) shouldBe + Seq(true, true, false, false, false, false, false, true, true, false, + true, false) + } - val patternMatchCountry = PatternMatch("Address Line 2", """\w""".r, where = Option("id < 5"), - analyzerOptions = Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.NULL))) - val state = patternMatchCountry.computeStateFrom(data) - val metric: DoubleMetric with FullColumn = patternMatchCountry.computeMetricFrom(state) + "correctly identify valid and invalid US phone numbers" in withSparkSession { + session => + val data = getDfWithPatternMatch(session) + + val patternMatchPhone = PatternMatch( + "PhoneNumber", + PHONE_NUMBER_US, + analyzerOptions = + Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)) + ) + val state = patternMatchPhone.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchPhone.computeMetricFrom(state) + + data + .withColumn("Phone_Match", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("Phone_Match")) shouldBe + Seq(true, true, true, true, true, true, true, true, true, true, false, + false) + } - data.withColumn("new", metric.fullColumn.get).collect().map(_.getAs[Any]("new")) shouldBe - Seq(true, true, true, true, false, null, null, null) + "correctly identify valid and invalid US postal codes" in withSparkSession { + session => + val data = getDfWithPatternMatch(session) + + val patternMatchPostalCode = PatternMatch( + "PostalCode", + POSTAL_CODE_US, + analyzerOptions = + Option(AnalyzerOptions(filteredRow = FilteredRowOutcome.TRUE)) + ) + val state = patternMatchPostalCode.computeStateFrom(data) + val metric: DoubleMetric with FullColumn = + patternMatchPostalCode.computeMetricFrom(state) + + data + .withColumn("PostalCode_Match", metric.fullColumn.get) + .collect() + .map(_.getAs[Any]("PostalCode_Match")) shouldBe + Seq(true, true, true, true, true, false, true, false, false, true, + true, false) } + } } diff --git a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala index 3a0866d2..ec9058cd 100644 --- a/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala +++ b/src/test/scala/com/amazon/deequ/utils/FixtureSupport.scala @@ -466,7 +466,26 @@ trait FixtureSupport { ("bar", 20) ).toDF("name", "age") } - + + def getDfWithPatternMatch(sparkSession: SparkSession): DataFrame = { + import sparkSession.implicits._ + + Seq( + // SSN PostalCode PhoneNumber + ("123-45-6789", "12345", "+1 234-567-8901"), // All valid + ("078-05-1120", "12345-6789", "(234) 567-8901 x1234"), // All valid with extension + ("000-12-3456", "12345", "234-567-8901"), // Invalid SSN (000 area), valid others + ("666-45-6789", "54321", "1234567890"), // Invalid SSN (666 area), valid phone + ("900-12-3456", "54321-1234", "+1 (234) 567 8901"), // Invalid SSN (900 range), valid postal and phone + ("123-45-0000", "ABCDE", "+1 234 567-8901"), // Invalid SSN (0000 serial), invalid postal code + ("123-00-6789", "54321", "123-45-6789"), // Invalid SSN (00 group), phone looks like SSN + ("123-45-6789", "123", "234-567-8901"), // Valid SSN, invalid postal (too short) + ("123-45-6789", "123456789", "+1 234 567 8901 ext1234"),// Valid SSN, valid phone with extension, invalid postal (no dash) + ("999-99-9999", "54321-6789", "234 567 8901"), // Invalid SSN (invalid area code 999), valid postal, valid phone + ("123-45-6789", "54321", "1234"), // Valid SSN, valid postal, invalid phone (too short) + ("", "", "") // Empty row, testing blanks + ).toDF("SSN", "PostalCode", "PhoneNumber") +} def getFakeNumericColumnProfileWithMinMaxMeanAndStdDev( columnName: String, completeness: Double,