From 40a4999722c26cc99b9a674f08b5bd2bdd2b1d7e Mon Sep 17 00:00:00 2001 From: arianna Date: Fri, 1 May 2020 11:14:43 -0600 Subject: [PATCH 01/29] adding stagedDate to both A&E indices --- .../resources/mappings/analysis_error_collectionIndex.json | 4 ++++ .../main/resources/mappings/analysis_error_granuleIndex.json | 4 ++++ .../java/org/cedar/onestop/indexer/util/IndexingUtils.java | 1 + 3 files changed, 9 insertions(+) diff --git a/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json index 8d080c175..3080898f1 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json @@ -2,6 +2,10 @@ "mappings": { "dynamic": "strict", "properties": { + "stagedDate": { + "type": "date", + "format": "epoch_millis" + }, "dataAccess": { "properties": { "dataAccessExists": { diff --git a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json index 8d080c175..3080898f1 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json @@ -2,6 +2,10 @@ "mappings": { "dynamic": "strict", "properties": { + "stagedDate": { + "type": "date", + "format": "epoch_millis" + }, "dataAccess": { "properties": { "dataAccessExists": { diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java index 67ff8b2b5..01a9a14e9 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java @@ -77,6 +77,7 @@ public static DocWriteRequest buildAnalysisAndErrorWriteRequest(String indexN else { var formattedRecord = new HashMap(); formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields())); + formattedRecord.put("stagedDate", input.getValue().timestamp()); return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord); } } From 86363f709d68dc379f039c3f01a53854d686f97b Mon Sep 17 00:00:00 2001 From: arianna Date: Fri, 1 May 2020 11:19:29 -0600 Subject: [PATCH 02/29] adding internalParentIdentifier to granule A&E index --- .../main/resources/mappings/analysis_error_granuleIndex.json | 3 +++ .../org/cedar/onestop/indexer/util/TransformationUtils.java | 1 + 2 files changed, 4 insertions(+) diff --git a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json index 3080898f1..6f9ca09ce 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json @@ -6,6 +6,9 @@ "type": "date", "format": "epoch_millis" }, + "internalParentIdentifier": { + "type": "keyword" + }, "dataAccess": { "properties": { "dataAccessExists": { diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index a38b130ae..eb279acc8 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ 
b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -32,6 +32,7 @@ public static Map reformatMessageForAnalysisAndErrors(ParsedReco var errors = record.getErrors(); var analysisMap = AvroUtils.avroToMap(analysis, true); + analysisMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); var errorsList = errors.stream() .map(e -> AvroUtils.avroToMap(e)) .collect(Collectors.toList()); From 589b77e0cd16dd2601682af3534817a8db8b4323 Mon Sep 17 00:00:00 2001 From: arianna Date: Mon, 11 May 2020 16:57:21 -0600 Subject: [PATCH 03/29] now isGranule instead of matchesIdentifiers, which is more clear --- buildSrc/src/main/kotlin/utils.kt | 2 +- .../analysis_error_collectionIndex.json | 2 +- .../mappings/analysis_error_granuleIndex.json | 2 +- .../onestop/indexer/util/ValidationUtils.java | 34 +++-- .../indexer/util/ValidationUtilsSpec.groovy | 133 +++++++++--------- 5 files changed, 87 insertions(+), 86 deletions(-) diff --git a/buildSrc/src/main/kotlin/utils.kt b/buildSrc/src/main/kotlin/utils.kt index abcfdf687..1a919da23 100644 --- a/buildSrc/src/main/kotlin/utils.kt +++ b/buildSrc/src/main/kotlin/utils.kt @@ -40,7 +40,7 @@ object Versions { const val PAC4J = "3.8.3" const val SNAKE_YAML = "1.24" - const val ONESTOP_SCHEMAS: String = "0.5.5" + const val ONESTOP_SCHEMAS: String = "analysis-updates-SNAPSHOT" } // data classes diff --git a/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json index 3080898f1..6b290290d 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json @@ -58,7 +58,7 @@ "hierarchyLevelNameExists": { "type": "boolean" }, - "matchesIdentifiers": { + "isGranule": { "type": "boolean" }, "parentIdentifierExists": { diff --git a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json index 6f9ca09ce..487ab98dd 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json @@ -61,7 +61,7 @@ "hierarchyLevelNameExists": { "type": "boolean" }, - "matchesIdentifiers": { + "isGranule": { "type": "boolean" }, "parentIdentifierExists": { diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/ValidationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/ValidationUtils.java index 356635fc7..e42609778 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/ValidationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/ValidationUtils.java @@ -39,7 +39,7 @@ public static ParsedRecord addValidationErrors(ValueWithTopic valu return ParsedRecord.newBuilder(record).setErrors(errors).build(); } - private static List validateRootRecord(ParsedRecord record) { + public static List validateRootRecord(ParsedRecord record) { var result = new ArrayList(); if (record.getDiscovery() == null || record.getDiscovery() == Discovery.newBuilder().build()) { result.add(buildValidationError("Discovery metadata missing. 
No metadata to load into OneStop.")); @@ -50,7 +50,7 @@ private static List validateRootRecord(ParsedRecord record) { return result; } - private static List validateIdentification(ParsedRecord record) { + public static List validateIdentification(ParsedRecord record) { var result = new ArrayList(); var identification = record.getAnalysis().getIdentification(); if (identification != null && !identification.getFileIdentifierExists() && !identification.getDoiExists()) { @@ -59,13 +59,10 @@ private static List validateIdentification(ParsedRecord record) { if (record.getType() == null ) { result.add(buildValidationError("Metadata type error -- type unknown.")); } - if (identification != null && !identification.getMatchesIdentifiers()) { - result.add(buildValidationError("Metadata type error -- hierarchyLevelName is 'granule' but no parentIdentifier provided.")); - } return result; } - private static List validateTopicPlacement(ParsedRecord record, String topic) { + public static List validateTopicPlacement(ParsedRecord record, String topic) { var result = new ArrayList(); var declaredRecordType = record.getType(); var recordTypeForTopic = IndexingUtils.determineTypeFromTopic(topic); @@ -77,19 +74,28 @@ private static List validateTopicPlacement(ParsedRecord record, Stri } var identification = record.getAnalysis().getIdentification(); - var isGranule = identification.getParentIdentifierExists() && identification.getHierarchyLevelNameExists() - && record.getDiscovery().getHierarchyLevelName().toLowerCase().equals("granule"); - if(isGranule && recordTypeForTopic != RecordType.granule) { + var hlm = record.getDiscovery().getHierarchyLevelName(); + // Granule on collection topic + if(identification != null && identification.getIsGranule() && recordTypeForTopic != RecordType.granule) { result.add(buildValidationError("Metadata indicates granule type but record is not on granule topic.")); } - if(!isGranule && recordTypeForTopic == RecordType.granule) { + // Non-granule on granule topic + if(identification != null && !identification.getIsGranule() && recordTypeForTopic == RecordType.granule) { result.add(buildValidationError("Metadata indicates non-granule type but record is on granule topic.")); + if(!identification.getParentIdentifierExists()) { + result.add(buildValidationError("Expected granule record but missing parentIdentifier.")); + } + if(!identification.getHierarchyLevelNameExists()) { + result.add(buildValidationError("Expected granule record but missing hierarchyLevelName. 
This must be present and equal to case-insensitive 'granule'.")); + } + if(identification.getHierarchyLevelNameExists() && !hlm.toLowerCase().equals("granule")) { + result.add(buildValidationError("Expected granule record but hierarchyLevelName is [ " + hlm + " ] and should be case-insensitive 'granule'.")); + } } - return result; } - private static List validateTitles(ParsedRecord record) { + public static List validateTitles(ParsedRecord record) { var result = new ArrayList(); var titles = record.getAnalysis().getTitles(); if (!titles.getTitleExists()) { @@ -98,7 +104,7 @@ private static List validateTitles(ParsedRecord record) { return result; } - private static List validateTemporalBounds(ParsedRecord record) { + public static List validateTemporalBounds(ParsedRecord record) { var result = new ArrayList(); var temporal = record.getAnalysis().getTemporalBounding(); if (temporal.getBeginDescriptor() == INVALID) { @@ -113,7 +119,7 @@ private static List validateTemporalBounds(ParsedRecord record) { return result; } - private static List validateSpatialBounds(ParsedRecord record) { + public static List validateSpatialBounds(ParsedRecord record) { var result = new ArrayList(); var spatial = record.getAnalysis().getSpatialBounding(); if (spatial.getSpatialBoundingExists() && !spatial.getIsValid()) { diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/ValidationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/ValidationUtilsSpec.groovy index d782f095c..c1894c1c0 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/ValidationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/ValidationUtilsSpec.groovy @@ -20,17 +20,13 @@ import static org.cedar.schemas.avro.psi.ValidDescriptor.VALID @Unroll class ValidationUtilsSpec extends Specification { - MockProcessorContext mockProcessorContext - TopicIdentifier ti - - def setup() { - mockProcessorContext = new MockProcessorContext() + def "valid message passes validation check"() { + given: + MockProcessorContext mockProcessorContext = new MockProcessorContext() mockProcessorContext.setTopic(TestUtils.collectionTopic) - ti = new TopicIdentifier<>() + TopicIdentifier ti = new TopicIdentifier<>() ti.init(mockProcessorContext) - } - def "valid message passes validation check"() { when: ValueWithTopic testInput = ti.transform(TestUtils.inputAvroRecord) @@ -39,6 +35,12 @@ class ValidationUtilsSpec extends Specification { } def "validation passes tombstones through"() { + given: + MockProcessorContext mockProcessorContext = new MockProcessorContext() + mockProcessorContext.setTopic(TestUtils.collectionTopic) + TopicIdentifier ti = new TopicIdentifier<>() + ti.init(mockProcessorContext) + when: ValueWithTopic testInput = ti.transform(null) @@ -47,16 +49,15 @@ class ValidationUtilsSpec extends Specification { } def "validates titles when #testCase"() { - def titleAnalysis = TitleAnalysis.newBuilder(TestUtils.inputAvroRecord.analysis.titles).setTitleExists(titleExists).build() - def analysis = Analysis.newBuilder(TestUtils.inputAvroRecord.analysis).setTitles(titleAnalysis).build() - def record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord).setAnalysis(analysis).build() + def titleAnalysis = TitleAnalysis.newBuilder().setTitleExists(titleExists).build() + def analysis = Analysis.newBuilder().setTitles(titleAnalysis).build() + def record = ParsedRecord.newBuilder().setAnalysis(analysis).build() when: - ValueWithTopic testInput = ti.transform(record) - def validated = 
ValidationUtils.addValidationErrors(testInput)

     then:
-    validated.errors.isEmpty() == isValid
+    errors.isEmpty() == isValid

     where:
     testCase           | isValid | titleExists
@@ -65,106 +66,100 @@ class ValidationUtilsSpec extends Specification {
   }

   def "validates identification when #testCase"() {
-    def identificationAnalysis = IdentificationAnalysis.newBuilder(TestUtils.inputAvroRecord.analysis.identification)
+    def identificationAnalysis = IdentificationAnalysis.newBuilder()
         .setFileIdentifierExists(hasFileId)
         .setDoiExists(hasDoi)
-        .setMatchesIdentifiers(matches)
         .build()
-    def analysis = Analysis.newBuilder(TestUtils.inputAvroRecord.analysis).setIdentification(identificationAnalysis).build()
-    def record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord).setAnalysis(analysis).build()
+    def analysis = Analysis.newBuilder().setIdentification(identificationAnalysis).build()
+    def record = ParsedRecord.newBuilder().setType(type).setAnalysis(analysis).build()

     when:
-    ValueWithTopic testInput = ti.transform(record)
-    def validated = ValidationUtils.addValidationErrors(testInput)
+    def errors = ValidationUtils.validateIdentification(record)

     then:
-    validated.errors.size() == errors
+    errors.size() == errorCount

     where:
-    testCase                | errors | hasFileId | hasDoi | matches
-    "has only fileId"       | 0      | true      | false  | true
-    "has only doi"          | 0      | false     | true   | true
-    "has no fileId nor doi" | 1      | false     | false  | true
-    "has mismatched type"   | 1      | true      | true   | false
-    "no id and mismatched"  | 2      | false     | false  | false
+    testCase                      | errorCount | hasFileId | hasDoi | type
+    "has only fileId"             | 0          | true      | false  | RecordType.collection
+    "has only doi"                | 0          | false     | true   | RecordType.granule
+    "has fileId and doi"          | 0          | true      | true   | RecordType.collection
+    "has no ids"                  | 1          | false     | false  | RecordType.granule
+    "has unknown type"            | 1          | true      | true   | null
+    "has no ids and unknown type" | 2          | false     | false  | null
   }

   def "validates temporal bounds when #testCase"() {
-    def temporalAnalysis = TemporalBoundingAnalysis.newBuilder(TestUtils.inputAvroRecord.analysis.temporalBounding)
+    def temporalAnalysis = TemporalBoundingAnalysis.newBuilder()
         .setBeginDescriptor(beginValid ? VALID : INVALID)
         .setEndDescriptor(endValid ? VALID : INVALID)
         .setInstantDescriptor(instantValid ?
VALID : INVALID) .build() - def analysis = Analysis.newBuilder(TestUtils.inputAvroRecord.analysis).setTemporalBounding(temporalAnalysis).build() - def record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord).setAnalysis(analysis).build() + def analysis = Analysis.newBuilder().setTemporalBounding(temporalAnalysis).build() + def record = ParsedRecord.newBuilder().setAnalysis(analysis).build() when: - ValueWithTopic testInput = ti.transform(record) - def validated = ValidationUtils.addValidationErrors(testInput) + def errors = ValidationUtils.validateTemporalBounds(record) then: - validated.errors.size() == errors + errors.size() == errorCount where: - testCase | errors | beginValid| endValid| instantValid - "has valid bounds" | 0 | true | true | true - "has invalid start" | 1 | false | true | true - "has invalid end" | 1 | true | false | true - "has invalid start and end" | 2 | false | false | true - "is invalid instant" | 1 | true | true | false - "is completely invalid" | 3 | false | false | false + testCase | errorCount | beginValid| endValid| instantValid + "has valid bounds" | 0 | true | true | true + "has invalid start" | 1 | false | true | true + "has invalid end" | 1 | true | false | true + "has invalid start and end" | 2 | false | false | true + "is invalid instant" | 1 | true | true | false + "is completely invalid" | 3 | false | false | false } def "validates spatial bounds when #testCase"() { - def spatialAnalysis = SpatialBoundingAnalysis.newBuilder(TestUtils.inputAvroRecord.analysis.spatialBounding) + def spatialAnalysis = SpatialBoundingAnalysis.newBuilder() .setSpatialBoundingExists(exists) .setIsValid(valid) .build() - def analysis = Analysis.newBuilder(TestUtils.inputAvroRecord.analysis).setSpatialBounding(spatialAnalysis).build() - def record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord).setAnalysis(analysis).build() + def analysis = Analysis.newBuilder().setSpatialBounding(spatialAnalysis).build() + def record = ParsedRecord.newBuilder().setAnalysis(analysis).build() when: - ValueWithTopic testInput = ti.transform(record) - def validated = ValidationUtils.addValidationErrors(testInput) + def errors = ValidationUtils.validateSpatialBounds(record) then: - validated.errors.size() == errors + errors.size() == errorCount where: - testCase | errors | exists | valid - "bounds are valid" | 0 | true | true - "bounds are invalid" | 1 | true | false - "bounds not not exist" | 0 | false | false + testCase | errorCount | exists | valid + "bounds exist and are valid" | 0 | true | true + "bounds exist and are invalid" | 1 | true | false + "bounds do not exist" | 0 | false | true } def "validates topic placement when #testCase"() { - def identification = IdentificationAnalysis.newBuilder(TestUtils.inputAvroRecord.analysis.identification) + given: + def identification = IdentificationAnalysis.newBuilder() .setParentIdentifierExists(hasParentId) .setHierarchyLevelNameExists(hlm != null) + .setIsGranule(hasParentId && hlm != null && hlm.equals("granule")) .build() - def discovery = Discovery.newBuilder(TestUtils.inputAvroRecord.getDiscovery()).setHierarchyLevelName(hlm).build() - def analysis = Analysis.newBuilder(TestUtils.inputAvroRecord.analysis).setIdentification(identification).build() - def record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord).setType(type).setAnalysis(analysis).setDiscovery(discovery).build() - - // Setup places record on the collection topic, so we overwrite setup here - mockProcessorContext.setTopic(topic) - ti = new TopicIdentifier<>() - 
ti.init(mockProcessorContext) + def analysis = Analysis.newBuilder().setIdentification(identification).build() + def discovery = Discovery.newBuilder().setHierarchyLevelName(hlm).build() + def record = ParsedRecord.newBuilder().setType(type).setAnalysis(analysis).setDiscovery(discovery).build() when: - ValueWithTopic testInput = ti.transform(record) - def validated = ValidationUtils.addValidationErrors(testInput) + def errors = ValidationUtils.validateTopicPlacement(record, topic) then: - validated.errors.size() == errors + errors.size() == errorCount where: - testCase | errors | hasParentId | hlm | type | topic - "it's valid" | 0 | false | null | RecordType.collection | TestUtils.collectionTopic - "RecordType only doesn't match" | 1 | false | "collection" | RecordType.granule | TestUtils.collectionTopic - "granule on collection topic (metadata check)" | 1 | true | "granule" | RecordType.collection | TestUtils.collectionTopic - "non-granule on granule topic (metadata check)" | 1 | false | null | RecordType.granule | TestUtils.granuleTopic - "metadata check and RecordType check fail" | 2 | false | "collection" | RecordType.collection | TestUtils.granuleTopic - + testCase | errorCount | hasParentId | hlm | type | topic + "it's valid" | 0 | false | null | RecordType.collection | TestUtils.collectionTopic + "RecordType only doesn't match" | 1 | false | "collection" | RecordType.granule | TestUtils.collectionTopic + "granule on collection topic (metadata check)" | 1 | true | "granule" | RecordType.collection | TestUtils.collectionTopic + "non-granule on granule topic (no pid)" | 2 | false | "granule" | RecordType.granule | TestUtils.granuleTopic + "non-granule on granule topic (no hlm)" | 2 | true | null | RecordType.granule | TestUtils.granuleTopic + "non-granule on granule topic (no pid or hlm)" | 3 | false | null | RecordType.granule | TestUtils.granuleTopic + "metadata check and RecordType check fail" | 4 | false | "collection" | RecordType.collection | TestUtils.granuleTopic } } From 9c9cd28b67bc0b937e575a2aaca181b5dab9892f Mon Sep 17 00:00:00 2001 From: arianna Date: Fri, 15 May 2020 14:41:53 -0600 Subject: [PATCH 04/29] updating definition of INSTANT and comments --- .../cedar/onestop/indexer/util/TransformationUtils.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index eb279acc8..b13dc7199 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -15,6 +15,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.cedar.schemas.avro.psi.ValidDescriptor.UNDEFINED; import static org.cedar.schemas.avro.psi.ValidDescriptor.VALID; /** @@ -268,7 +269,7 @@ private static Map prepareDates(TemporalBounding bounding, Tempo var result = new HashMap(); // If bounding is actually an instant, set search fields accordingly - if (analysis.getRangeDescriptor() == TimeRangeDescriptor.INSTANT) { + if (analysis.getRangeDescriptor() == TimeRangeDescriptor.INSTANT && analysis.getBeginDescriptor() == UNDEFINED) { beginDate = analysis.getInstantUtcDateTimeString(); year = parseYear(beginDate); @@ -293,8 +294,8 @@ private static Map prepareDates(TemporalBounding bounding, Tempo beginYear = year; endYear = year; } else { - // If dates exist and are validSearchFormat (only false 
here if paleo, since we filtered out bad data earlier), - // use value from analysis block where dates are UTC datetime normalized + // If dates exist (thus VALID) and are indexable use value from analysis block where dates are UTC datetime normalized, + // else only set the year values as this is indicative of a paleo date beginDate = analysis.getBeginDescriptor() == VALID && analysis.getBeginIndexable() ? analysis.getBeginUtcDateTimeString() : null; beginYear = parseYear(analysis.getBeginUtcDateTimeString()); endDate = analysis.getEndDescriptor() == VALID && analysis.getEndIndexable() ? analysis.getEndUtcDateTimeString() : null; From 9c57833314e9623c4051644d06dc2497abaef249 Mon Sep 17 00:00:00 2001 From: arianna Date: Fri, 15 May 2020 17:21:04 -0600 Subject: [PATCH 05/29] suddenly salad! --- .../onestop/indexer/util/ValidationUtils.java | 112 ++++++++++---- .../indexer/util/ValidationUtilsSpec.groovy | 141 ++++++++++++++++-- 2 files changed, 212 insertions(+), 41 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/ValidationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/ValidationUtils.java index e42609778..ba8fb99e9 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/ValidationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/ValidationUtils.java @@ -9,8 +9,8 @@ import java.util.ArrayList; import java.util.List; -import static org.cedar.schemas.avro.psi.ValidDescriptor.INVALID; -import static org.cedar.schemas.avro.psi.ValidDescriptor.UNDEFINED; +import static org.cedar.schemas.avro.psi.ValidDescriptor.*; +import static org.cedar.schemas.avro.psi.TimeRangeDescriptor.*; /** * This class contains utilities for validating the contents of the Avro (schemas) records prior to indexing @@ -19,8 +19,6 @@ public class ValidationUtils { static final private Logger log = LoggerFactory.getLogger(ValidationUtils.class); - static final private String VALIDATION_ERROR_TITLE = "Invalid for search indexing"; - public static ParsedRecord addValidationErrors(ValueWithTopic value) { ParsedRecord record = value == null ? null : value.getValue(); if (record == null) { @@ -41,11 +39,13 @@ public static ParsedRecord addValidationErrors(ValueWithTopic valu public static List validateRootRecord(ParsedRecord record) { var result = new ArrayList(); - if (record.getDiscovery() == null || record.getDiscovery() == Discovery.newBuilder().build()) { - result.add(buildValidationError("Discovery metadata missing. No metadata to load into OneStop.")); + if (record.getDiscovery() == null || record.getDiscovery().equals(Discovery.newBuilder().build())) { + result.add(buildValidationError(ValidationError.ROOT, + "Discovery metadata missing -- no metadata to load into OneStop.")); } - if (record.getAnalysis() == null || record.getAnalysis() == Analysis.newBuilder().build()) { - result.add(buildValidationError("Analysis metadata missing. 
Cannot verify metadata quality for OneStop.")); + if (record.getAnalysis() == null || record.getAnalysis().equals(Analysis.newBuilder().build())) { + result.add(buildValidationError(ValidationError.ROOT, + "Analysis metadata missing -- cannot verify metadata quality for OneStop.")); } return result; } @@ -54,10 +54,12 @@ public static List validateIdentification(ParsedRecord record) { var result = new ArrayList(); var identification = record.getAnalysis().getIdentification(); if (identification != null && !identification.getFileIdentifierExists() && !identification.getDoiExists()) { - result.add(buildValidationError("Missing identifier - record contains neither a fileIdentifier nor a DOI")); + result.add(buildValidationError(ValidationError.IDENTIFICATION, + "Missing identifier -- record contains neither a fileIdentifier nor a DOI")); } if (record.getType() == null ) { - result.add(buildValidationError("Metadata type error -- type unknown.")); + result.add(buildValidationError(ValidationError.IDENTIFICATION, + "Metadata type error -- type unknown.")); } return result; } @@ -68,28 +70,34 @@ public static List validateTopicPlacement(ParsedRecord record, Strin var recordTypeForTopic = IndexingUtils.determineTypeFromTopic(topic); if(declaredRecordType != recordTypeForTopic) { - result.add(buildValidationError("Declared record type [ " + declaredRecordType.toString() + + result.add(buildValidationError(ValidationError.TYPE, + "Declared record type [ " + declaredRecordType.toString() + " ] does not match expected type [ " + recordTypeForTopic.toString() + - " ]. Metadata was ingested downstream into wrong topic.")); + " ]. Metadata was ingested upstream into wrong topic.")); } var identification = record.getAnalysis().getIdentification(); var hlm = record.getDiscovery().getHierarchyLevelName(); // Granule on collection topic if(identification != null && identification.getIsGranule() && recordTypeForTopic != RecordType.granule) { - result.add(buildValidationError("Metadata indicates granule type but record is not on granule topic.")); + result.add(buildValidationError(ValidationError.TYPE, + "Metadata indicates granule type but record is not on granule topic.")); } // Non-granule on granule topic if(identification != null && !identification.getIsGranule() && recordTypeForTopic == RecordType.granule) { - result.add(buildValidationError("Metadata indicates non-granule type but record is on granule topic.")); + result.add(buildValidationError(ValidationError.TYPE, + "Metadata indicates non-granule type but record is on granule topic.")); if(!identification.getParentIdentifierExists()) { - result.add(buildValidationError("Expected granule record but missing parentIdentifier.")); + result.add(buildValidationError(ValidationError.TYPE, + "Expected granule record but missing parentIdentifier.")); } if(!identification.getHierarchyLevelNameExists()) { - result.add(buildValidationError("Expected granule record but missing hierarchyLevelName. This must be present and equal to case-insensitive 'granule'.")); + result.add(buildValidationError(ValidationError.TYPE, + "Expected granule record but missing hierarchyLevelName. 
This must be present and equal to case-insensitive 'granule'.")); } if(identification.getHierarchyLevelNameExists() && !hlm.toLowerCase().equals("granule")) { - result.add(buildValidationError("Expected granule record but hierarchyLevelName is [ " + hlm + " ] and should be case-insensitive 'granule'.")); + result.add(buildValidationError(ValidationError.TYPE, + "Expected granule record but hierarchyLevelName is [ " + hlm + " ] and should be case-insensitive 'granule'.")); } } return result; @@ -99,23 +107,55 @@ public static List validateTitles(ParsedRecord record) { var result = new ArrayList(); var titles = record.getAnalysis().getTitles(); if (!titles.getTitleExists()) { - result.add(buildValidationError("Missing title")); + result.add(buildValidationError(ValidationError.TITLE, + "Missing title")); } return result; } public static List validateTemporalBounds(ParsedRecord record) { var result = new ArrayList(); - var temporal = record.getAnalysis().getTemporalBounding(); - if (temporal.getBeginDescriptor() == INVALID) { - result.add(buildValidationError("Invalid beginDate")); + var temporalAnalysis = record.getAnalysis().getTemporalBounding(); + + // No temporal information is okay + if (temporalAnalysis == null) { + return result; + } + + var range = temporalAnalysis.getRangeDescriptor(); + if (range == NOT_APPLICABLE) { + // Range is always NOT_APPLICABLE when there is an error in one or more individual date fields; temporalBounding + // access is null-safe here since an INVALID date only occurs with parsing errors + var temporalDiscovery = record.getDiscovery().getTemporalBounding(); + var begin = temporalDiscovery.getBeginDate(); + var end = temporalDiscovery.getEndDate(); + var instant = temporalDiscovery.getInstant(); + if (temporalAnalysis.getBeginDescriptor() == ValidDescriptor.INVALID) { + result.add(buildValidationError(ValidationError.TEMPORAL_FIELD, + "The beginDate [ " + begin + " ] could not be parsed.")); + } + if (temporalAnalysis.getEndDescriptor() == ValidDescriptor.INVALID) { + result.add(buildValidationError(ValidationError.TEMPORAL_FIELD, + "The endDate [ " + end + " ] could not be parsed.")); + } + if (temporalAnalysis.getInstantDescriptor() == ValidDescriptor.INVALID) { + result.add(buildValidationError(ValidationError.TEMPORAL_FIELD, + "The instant [ " + instant + " ] could not be parsed.")); + } } - if (temporal.getEndDescriptor() == INVALID) { - result.add(buildValidationError("Invalid endDate")); + else if (range == AMBIGUOUS) { + result.add(buildValidationError(ValidationError.TEMPORAL_RANGE, + "Ambiguous temporal bounding -- both an instant and a beginDate present, defining two valid ranges.")); } - if (temporal.getBeginDescriptor() != UNDEFINED && temporal.getEndDescriptor() != UNDEFINED && temporal.getInstantDescriptor() == INVALID) { - result.add(buildValidationError("Invalid instant-only date")); + else if (range == BACKWARDS) { + result.add(buildValidationError(ValidationError.TEMPORAL_RANGE, + "Backwards temporal bounding -- beginDate after endDate.")); } + else if (range == TimeRangeDescriptor.INVALID) { + result.add(buildValidationError(ValidationError.TEMPORAL_RANGE, + "Invalid temporal bounding -- endDate present without beginDate.")); + } + return result; } @@ -123,14 +163,30 @@ public static List validateSpatialBounds(ParsedRecord record) { var result = new ArrayList(); var spatial = record.getAnalysis().getSpatialBounding(); if (spatial.getSpatialBoundingExists() && !spatial.getIsValid()) { - result.add(buildValidationError("Invalid GeoJSON for 
spatial bounding")); + result.add(buildValidationError(ValidationError.SPATIAL, + "Invalid GeoJSON for spatial bounding")); } return result; } - private static ErrorEvent buildValidationError(String details) { + public enum ValidationError { + ROOT("Record Missing Major Component"), + IDENTIFICATION("Identification Error"), + TYPE("Type Error"), + TITLE("Title Error"), + TEMPORAL_FIELD("Temporal Bounding Field Error"), + TEMPORAL_RANGE("Temporal Bounding Range Error"), + SPATIAL("Spatial Bounding Error"); + + private final String title; + ValidationError(String title) { this.title = title; } + + String getTitle() { return title; } + } + + private static ErrorEvent buildValidationError(ValidationError errorCategory, String details) { return ErrorEvent.newBuilder() - .setTitle(VALIDATION_ERROR_TITLE) + .setTitle(errorCategory.getTitle()) .setDetail(details) .setSource(StreamsApps.INDEXER_ID) .build(); diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/ValidationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/ValidationUtilsSpec.groovy index c1894c1c0..32aba54df 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/ValidationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/ValidationUtilsSpec.groovy @@ -9,13 +9,18 @@ import org.cedar.schemas.avro.psi.IdentificationAnalysis import org.cedar.schemas.avro.psi.ParsedRecord import org.cedar.schemas.avro.psi.RecordType import org.cedar.schemas.avro.psi.SpatialBoundingAnalysis +import org.cedar.schemas.avro.psi.TemporalBounding import org.cedar.schemas.avro.psi.TemporalBoundingAnalysis +import org.cedar.schemas.avro.psi.TimeRangeDescriptor import org.cedar.schemas.avro.psi.TitleAnalysis +import org.cedar.schemas.avro.psi.ValidDescriptor import spock.lang.Specification import spock.lang.Unroll -import static org.cedar.schemas.avro.psi.ValidDescriptor.INVALID -import static org.cedar.schemas.avro.psi.ValidDescriptor.VALID +import static org.cedar.schemas.avro.psi.ValidDescriptor.*; +import static org.cedar.schemas.avro.psi.TimeRangeDescriptor.*; + +import static org.cedar.onestop.indexer.util.ValidationUtils.ValidationError.*; @Unroll class ValidationUtilsSpec extends Specification { @@ -48,6 +53,58 @@ class ValidationUtilsSpec extends Specification { ValidationUtils.addValidationErrors(testInput) == null } + def "null Discovery fails root validation"() { + def analysis = Analysis.newBuilder(TestUtils.inputAvroRecord.analysis).build() + def record = ParsedRecord.newBuilder().setAnalysis(analysis).build() + + when: + def errors = ValidationUtils.validateRootRecord(record) + + then: + record.discovery == null + errors.size() == 1 + errors[0].title.equals(ROOT.title) + } + + def "empty Discovery fails root validation"() { + def analysis = Analysis.newBuilder(TestUtils.inputAvroRecord.analysis).build() + def record = ParsedRecord.newBuilder().setDiscovery(Discovery.newBuilder().build()).setAnalysis(analysis).build() + + when: + def errors = ValidationUtils.validateRootRecord(record) + + then: + record.discovery == Discovery.newBuilder().build() + errors.size() == 1 + errors[0].title.equals(ROOT.title) + } + + def "null Analysis fails root validation"() { + def discovery = Discovery.newBuilder(TestUtils.inputAvroRecord.discovery).build() + def record = ParsedRecord.newBuilder().setDiscovery(discovery).build() + + when: + def errors = ValidationUtils.validateRootRecord(record) + + then: + record.analysis == null + errors.size() == 1 + errors[0].title.equals(ROOT.title) + } 
+
+  def "empty Analysis fails root validation"() {
+    def discovery = Discovery.newBuilder(TestUtils.inputAvroRecord.discovery).build()
+    def record = ParsedRecord.newBuilder().setDiscovery(discovery).setAnalysis(Analysis.newBuilder().build()).build()
+
+    when:
+    def errors = ValidationUtils.validateRootRecord(record)
+
+    then:
+    record.analysis == Analysis.newBuilder().build()
+    errors.size() == 1
+    errors[0].title.equals(ROOT.title)
+  }
+
   def "validates titles when #testCase"() {
     def titleAnalysis = TitleAnalysis.newBuilder().setTitleExists(titleExists).build()
     def analysis = Analysis.newBuilder().setTitles(titleAnalysis).build()
     def record = ParsedRecord.newBuilder().setAnalysis(analysis).build()
@@ -59,6 +116,11 @@ class ValidationUtilsSpec extends Specification {
     then:
     errors.isEmpty() == isValid

+    and:
+    errors.every({ e -> e.title == TITLE.title })
+
     where:
     testCase           | isValid | titleExists
     "title is missing" | false   | false
@@ -79,6 +141,11 @@ class ValidationUtilsSpec extends Specification {
     then:
     errors.size() == errorCount

+    and:
+    errors.every({ e -> e.title == IDENTIFICATION.title })
+
     where:
     testCase          | errorCount | hasFileId | hasDoi | type
     "has only fileId" | 0          | true      | false  | RecordType.collection
@@ -89,11 +156,42 @@ class ValidationUtilsSpec extends Specification {
     "has no ids and unknown type" | 2          | false     | false  | null
   }

-  def "validates temporal bounds when #testCase"() {
+  def "validates temporal bounds by field when #testCase"() {
+    def temporalAnalysis = TemporalBoundingAnalysis.newBuilder()
+        .setBeginDescriptor(begin)
+        .setEndDescriptor(end)
+        .setInstantDescriptor(instant)
+        .setRangeDescriptor(NOT_APPLICABLE) // Forces traversal through field checks for all test cases
+        .build()
+    def analysis = Analysis.newBuilder().setTemporalBounding(temporalAnalysis).build()
+    // Need to supply content for Discovery here to avoid NPEs
+    def temporalBounding = TemporalBounding.newBuilder().setBeginDate("begin").setEndDate("end").setInstant("instant").build()
+    def discovery = Discovery.newBuilder().setTemporalBounding(temporalBounding).build()
+    def record = ParsedRecord.newBuilder().setAnalysis(analysis).setDiscovery(discovery).build()
+
+    when:
+    def errors = ValidationUtils.validateTemporalBounds(record)
+
+    then:
+    errors.size() == errorCount
+
+    and:
+    errors.every({ e -> e.title == TEMPORAL_FIELD.title })
+
+    where:
+    testCase              | errorCount | begin                     | end                       | instant
+    "all dates undefined" | 0          | ValidDescriptor.UNDEFINED | ValidDescriptor.UNDEFINED | ValidDescriptor.UNDEFINED
+    "all dates valid"     | 0          | VALID                     | VALID                     | VALID
+    "has invalid begin"   | 1          | INVALID                   | VALID                     | VALID
+    "has invalid end"     | 1          | VALID                     | INVALID                   | VALID
+    "has invalid instant" | 1          | VALID                     | VALID                     | INVALID
+  }
+
+  def "validates temporal bounds by range when #testCase"() {
     def temporalAnalysis = TemporalBoundingAnalysis.newBuilder()
-        .setBeginDescriptor(beginValid ? VALID : INVALID)
-        .setEndDescriptor(endValid ? VALID : INVALID)
-        .setInstantDescriptor(instantValid ?
VALID : INVALID)
+        .setRangeDescriptor(range)
         .build()
     def analysis = Analysis.newBuilder().setTemporalBounding(temporalAnalysis).build()
     def record = ParsedRecord.newBuilder().setAnalysis(analysis).build()

     when:
     def errors = ValidationUtils.validateTemporalBounds(record)
@@ -104,14 +202,21 @@ class ValidationUtilsSpec extends Specification {
     then:
     errors.size() == errorCount

+    and:
+    errors.every({ e -> e.title == TEMPORAL_RANGE.title })
+
     where:
-    testCase                    | errorCount | beginValid| endValid| instantValid
-    "has valid bounds"          | 0          | true      | true    | true
-    "has invalid start"         | 1          | false     | true    | true
-    "has invalid end"           | 1          | true      | false   | true
-    "has invalid start and end" | 2          | false     | false   | true
-    "is invalid instant"        | 1          | true      | true    | false
-    "is completely invalid"     | 3          | false     | false   | false
+    testCase              | errorCount | range
+    "has BOUNDED range"   | 0          | BOUNDED
+    "has INSTANT range"   | 0          | INSTANT
+    "has ONGOING range"   | 0          | ONGOING
+    "has UNDEFINED range" | 0          | TimeRangeDescriptor.UNDEFINED
+    "has AMBIGUOUS range" | 1          | AMBIGUOUS
+    "has BACKWARDS range" | 1          | BACKWARDS
+    "has INVALID range"   | 1          | TimeRangeDescriptor.INVALID
+    // NOT_APPLICABLE range does not generate a TEMPORAL_RANGE error; it is covered by the validate-by-field test
   }

   def "validates spatial bounds when #testCase"() {
@@ -128,6 +233,11 @@ class ValidationUtilsSpec extends Specification {
     then:
     errors.size() == errorCount

+    and:
+    errors.every({ e -> e.title == SPATIAL.title })
+
     where:
     testCase                     | errorCount | exists | valid
     "bounds exist and are valid" | 0          | true   | true
@@ -152,6 +262,11 @@ class ValidationUtilsSpec extends Specification {
     then:
     errors.size() == errorCount

+    and:
+    errors.every({ e -> e.title == TYPE.title })
+
     where:
     testCase     | errorCount | hasParentId | hlm  | type                  | topic
     "it's valid" | 0          | false       | null | RecordType.collection | TestUtils.collectionTopic

From 9321f1ed9166dc9a4a5af2418f009f0c5a446ff4 Mon Sep 17 00:00:00 2001
From: Zeb
Date: Wed, 20 May 2020 17:04:21 -0600
Subject: [PATCH 06/29] Update analysis and error mappings to prevent indexing failures due to strictness.
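
Both A&E mappings declare "dynamic": "strict", so Elasticsearch rejects any
document that carries a field absent from the mapping (the write fails with a
strict_dynamic_mapping_exception rather than silently dropping the field). The
ErrorEvent objects we index populate a "source" field (see buildValidationError
in ValidationUtils), so the previously mapped "applicationSource" never matched
and error-bearing documents could not be indexed; the granule identification
block likewise needed "parentIdentifierString". A rough sketch of the corrected
nested errors mapping, with unrelated fields elided:

    "errors": {
      "type": "nested",
      "properties": {
        "source": { "type": "keyword" },
        ...
      }
    }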
--- .../resources/mappings/analysis_error_collectionIndex.json | 4 ++-- .../resources/mappings/analysis_error_granuleIndex.json | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json index 6b290290d..ba5e2d670 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json @@ -163,7 +163,7 @@ "errors": { "type": "nested", "properties": { - "applicationSource": { + "source": { "type": "keyword" }, "title": { @@ -176,4 +176,4 @@ } } } -} \ No newline at end of file +} diff --git a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json index 487ab98dd..591132b3a 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json @@ -66,6 +66,9 @@ }, "parentIdentifierExists": { "type": "boolean" + }, + "parentIdentifierString": { + "type": "text" } } }, @@ -166,7 +169,7 @@ "errors": { "type": "nested", "properties": { - "applicationSource": { + "source": { "type": "keyword" }, "title": { @@ -179,4 +182,4 @@ } } } -} \ No newline at end of file +} From 9c3f3356a27f4f79bfaf61d1c6127d803f350d8b Mon Sep 17 00:00:00 2001 From: Zeb Date: Tue, 26 May 2020 13:43:07 -0600 Subject: [PATCH 07/29] mess of half completed nested mapping comparison --- buildSrc/src/main/kotlin/utils.kt | 4 +- .../mappings/analysis_error_granuleIndex.json | 3 - .../onestop/indexer/util/IndexingInput.java | 18 +++++ .../onestop/indexer/util/IndexingUtils.java | 4 + .../indexer/util/TransformationUtils.java | 80 +++++++++++++++++++ .../util/TransformationUtilsSpec.groovy | 51 +++++++++++- 6 files changed, 154 insertions(+), 6 deletions(-) diff --git a/buildSrc/src/main/kotlin/utils.kt b/buildSrc/src/main/kotlin/utils.kt index 1a919da23..6e925dd10 100644 --- a/buildSrc/src/main/kotlin/utils.kt +++ b/buildSrc/src/main/kotlin/utils.kt @@ -40,7 +40,7 @@ object Versions { const val PAC4J = "3.8.3" const val SNAKE_YAML = "1.24" - const val ONESTOP_SCHEMAS: String = "analysis-updates-SNAPSHOT" + const val ONESTOP_SCHEMAS: String = "1250-date-parsing-exception-SNAPSHOT" } // data classes @@ -86,4 +86,4 @@ fun parseDateISO(date: String): Date { val timeFormatter: DateTimeFormatter = DateTimeFormatter.ISO_DATE_TIME val accessor: TemporalAccessor = timeFormatter.parse(date) return Date.from(Instant.from(accessor)) -} \ No newline at end of file +} diff --git a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json index 591132b3a..9a86b537c 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json @@ -46,9 +46,6 @@ } } }, - "fileIdentifierExists": { - "type": "boolean" - }, "fileIdentifierString": { "type": "text", "fields": { diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java index fcfe59659..f5ba13220 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java +++ 
b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java @@ -72,6 +72,24 @@ public Set getTargetAnalysisAndErrorsIndexFields() { } } + // public Map getTargetAnalysisAndErrorsIndexMapping() { + // var aeAlias = esConfig.analysisAndErrorsAliasFromType(recordType.toString()); + // if(aeAlias != null) { + // return esConfig.indexedProperties(aeAlias); + // } + // else { + // return new HashMap<>(); + // } + // } + + // public static Map getNestedKeys(Map originalMap) { + // if (keysToKeep == null || keysToKeep.size() == 0) { + // return new HashMap<>(); + // } + // return originalMap.entrySet().stream() + // .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + // } + @Override public String toString() { return "IndexingInput {" + diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java index 01a9a14e9..3be1903ed 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java @@ -64,6 +64,8 @@ public static DocWriteRequest buildSearchWriteRequest(String indexName, DocWr } else { var formattedRecord = new HashMap(); + // log.info("build search write request "+input.getValue().value()+ " and "+input.getTargetSearchIndexFields()); + // log.info("transforms to "+TransformationUtils.reformatMessageForSearch(input.getValue().value(), input.getTargetSearchIndexFields())); formattedRecord.putAll(TransformationUtils.reformatMessageForSearch(input.getValue().value(), input.getTargetSearchIndexFields())); formattedRecord.put("stagedDate", input.getValue().timestamp()); return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord); @@ -76,6 +78,8 @@ public static DocWriteRequest buildAnalysisAndErrorWriteRequest(String indexN } else { var formattedRecord = new HashMap(); + log.info("build A&E write request "+input.getValue().value() +" and "+ input.getTargetAnalysisAndErrorsIndexFields()); + log.info("transforms to "+TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields())); formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields())); formattedRecord.put("stagedDate", input.getValue().timestamp()); return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord); diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index b13dc7199..dfcb0b03d 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -18,6 +18,9 @@ import static org.cedar.schemas.avro.psi.ValidDescriptor.UNDEFINED; import static org.cedar.schemas.avro.psi.ValidDescriptor.VALID; + +// TODO import org.apache.kafka.streams.StreamsBuilder; + /** * This class contains utilities for transforming the contents of the Avro (schemas) records into the appropriate * corresponding Elasticsearch mapping format. @@ -38,14 +41,91 @@ public static Map reformatMessageForAnalysisAndErrors(ParsedReco .map(e -> AvroUtils.avroToMap(e)) .collect(Collectors.toList()); + analysisMap.put("errors", errorsList); // drop fields not present in target index + // TODO make recursive! 
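+      // NOTE: the filter below matches top-level keys only; a retained key such as
+      // "identification" is copied with its entire nested value, so unmapped nested
+      // fields can still reach the strict-mapped index (hence the TODO above).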
var result = new LinkedHashMap(targetFields.size()); targetFields.forEach(f -> result.put(f, analysisMap.get(f))); return result; } + public static Map unfilteredAEMessage(ParsedRecord record) { + var analysis = record.getAnalysis(); + var errors = record.getErrors(); + + var analysisMap = AvroUtils.avroToMap(analysis, true); + analysisMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); + var errorsList = errors.stream() + .map(e -> AvroUtils.avroToMap(e)) + .collect(Collectors.toList()); + + + var garbageError = new LinkedHashMap(); + garbageError.put("nonsense", "horrible"); + garbageError.put("source", "valid field" ); + errorsList.add(garbageError); + + + analysisMap.put("errors", errorsList); + analysisMap.put("garbage", "nuke meeee"); // FIXME + return analysisMap; + } + + public static Map stuffToRemove(Map analysisMap, Map mapping) { + var result = new LinkedHashMap(); + // analysisMap.entrySet().stream().forEach(e -> { + // if( !mapping.containsKey(e.getKey())) { + // result.put(e.getKey(), e.getValue()); + // } else { + // if (e.getValue() instanceof Map){ + // System.out.println("ZEB: the value is a map!"); + // // System.out.println("mapping: "+mapping.get(e.getKey()).get("properties")); + // // System.out.println("--> "+stuffToRemove((Map)e.getValue(), (Map)mapping.get(e.getKey()).get("properties"))); + // result.put(e.getKey(), stuffToRemove((Map)e.getValue(), (Map)((Map)mapping.get(e.getKey())).get("properties"))); // TODO brute force assumes mapping is an object map string:object here too + // } else if(e.getValue() instanceof Collection){ + // // TODO!!!! + // // result.put(e.getKey(), ((Collection)e.getValue()).filter(item -> !stuffToRemove((Map)item, (Map)((Map)mapping.get(e.getKey())).get("properties"))).isEmpty()); + // } + // } + // }); + // return result; + + analysisMap.forEach((k, v) -> { + if (!mapping.containsKey(k)) { + result.put(k, v); + } else { + Map nestedProperties = (Map)((Map)mapping.get(k)).get("properties"); // TODO assumes mapping is also a Map! 
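+        // Recurse into nested objects via the sub-mapping's "properties"; for lists,
+        // diff each element against the same sub-mapping and keep only elements that
+        // still contain something unmapped. Scalars under a mapped key are treated
+        // as valid and excluded from the removal set.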
+ + if (v instanceof Map) { + result.put(k, stuffToRemove((Map) v, nestedProperties)); + } else if (v instanceof List) { + var list = ((List) v).stream().map(item -> stuffToRemove((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) + .collect(Collectors.toList()); + System.out.println("ZEB - list: "+list); + result.put(k, list); + } + } + }); + return result; + } +/* + toRemove.forEach((k, v) -> { + var originalValue = mergedMap.get(k); + if (v instanceof Map && originalValue instanceof Map) { + mergedMap.put(k, removeFromMap((Map) originalValue, (Map) v)); + } + else if (v instanceof List && originalValue instanceof List) { + var mergedList = new HashSet<>((List) originalValue); + mergedList.removeAll((List) v); + mergedMap.put(k, mergedList); + } + else if ((v == null && originalValue == null) || v.equals(originalValue)) { + mergedMap.remove(k); + } + }); +*/ /////////////////////////////////////////////////////////////////////////////// // Indexing For Search // diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index 1859851ae..f9e6e5c6c 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -1,6 +1,10 @@ package org.cedar.onestop.indexer.util import org.cedar.schemas.analyze.Analyzers +import org.cedar.schemas.analyze.Temporal +import org.cedar.schemas.avro.psi.Analysis +import org.cedar.schemas.avro.psi.TemporalBoundingAnalysis +import org.cedar.schemas.avro.psi.ValidDescriptor import org.cedar.schemas.avro.psi.Discovery import org.cedar.schemas.avro.psi.FileInformation import org.cedar.schemas.avro.psi.ParsedRecord @@ -8,9 +12,12 @@ import org.cedar.schemas.avro.psi.RecordType import org.cedar.schemas.avro.psi.Relationship import org.cedar.schemas.avro.psi.RelationshipType import org.cedar.schemas.avro.psi.TemporalBounding +import java.time.temporal.ChronoUnit import spock.lang.Specification import spock.lang.Unroll +import groovy.json.JsonOutput + import static org.cedar.schemas.avro.util.TemporalTestData.getSituations @Unroll @@ -18,6 +25,7 @@ class TransformationUtilsSpec extends Specification { static collectionFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet() static granuleFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet() + static granuleAnalysisErrorFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS).keySet() static expectedKeywords = [ "SIO > Super Important Organization", @@ -83,6 +91,47 @@ class TransformationUtilsSpec extends Specification { 'granule' | granuleFields | TestUtils.inputGranuleRecord } + def "only mapped nested fields are indexed"() { + when: + def result = TransformationUtils.reformatMessageForAnalysisAndErrors(TestUtils.inputGranuleRecord, granuleAnalysisErrorFields) + + + def asdf = TransformationUtils.stuffToRemove(TransformationUtils.unfilteredAEMessage(TestUtils.inputGranuleRecord), TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + + println("ZEB") + println(result) + println(JsonOutput.toJson(asdf)) + + then: + result.keySet().each({ assert granuleAnalysisErrorFields.contains(it) }) + } + + def "can i construct a record"() { + when: + ParsedRecord record = 
ParsedRecord.newBuilder(TestUtils.inputAvroRecord) + .setAnalysis( + Analysis.newBuilder().setTemporalBounding( + TemporalBoundingAnalysis.newBuilder() + .setBeginDescriptor(ValidDescriptor.VALID) + .setBeginIndexable(true) + .setBeginPrecision(ChronoUnit.DAYS.toString()) + .setBeginZoneSpecified(null) + .setBeginUtcDateTimeString("2000-02-01") + .setBeginYear(2000) + .setBeginMonth(2) + .setBeginDayOfYear(32) + .setBeginDayOfMonth(1) + .build() + ).build()).build() + def asdf = TransformationUtils.stuffToRemove(TransformationUtils.unfilteredAEMessage(record), TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + + println("ZEB") + println(JsonOutput.toJson(asdf)) + + then: + asdf.keySet().each({ assert granuleAnalysisErrorFields.contains(it) }) + } + //////////////////////////////// // Identifiers, "Names" // //////////////////////////////// @@ -266,7 +315,7 @@ class TransformationUtilsSpec extends Specification { def "temporal bounding with #testCase dates is prepared correctly"() { given: def bounding = TemporalBounding.newBuilder().setBeginDate(begin).setEndDate(end).build() - def analysis = Analyzers.analyzeTemporalBounding(Discovery.newBuilder().setTemporalBounding(bounding).build()) + def analysis = Temporal.analyzeBounding(Discovery.newBuilder().setTemporalBounding(bounding).build()) when: def result = TransformationUtils.prepareDates(bounding, analysis) From 21d3448a40ea3aef54a3f049faf49f8c350f3ce0 Mon Sep 17 00:00:00 2001 From: arianna Date: Wed, 27 May 2020 09:38:00 -0600 Subject: [PATCH 08/29] wip making new data-utils module --- data-utils/build.gradle | 14 ++ .../org/cedar/onestop/utils/ListUtils.java | 48 ++++++ .../org/cedar/onestop/utils/MapUtils.java | 155 ++++++++++++++++++ elastic-common/build.gradle | 2 + .../elastic/common/ElasticsearchConfig.java | 1 + .../onestop/kafka/common/util/DataUtils.java | 2 + settings.gradle.kts | 1 + 7 files changed, 223 insertions(+) create mode 100644 data-utils/build.gradle create mode 100644 data-utils/src/main/java/org/cedar/onestop/utils/ListUtils.java create mode 100644 data-utils/src/main/java/org/cedar/onestop/utils/MapUtils.java diff --git a/data-utils/build.gradle b/data-utils/build.gradle new file mode 100644 index 000000000..96c856cb7 --- /dev/null +++ b/data-utils/build.gradle @@ -0,0 +1,14 @@ +sourceCompatibility = 11 +targetCompatibility = 11 + +dependencies { + def Versions = project.Versions + + compileOnly("org.slf4j:slf4j-api:1.7.25") + + implementation("org.yaml:snakeyaml:${Versions.SNAKE_YAML}") +} + +jar { + archiveBaseName.set("${rootProject.name}-${project.name}") +} \ No newline at end of file diff --git a/data-utils/src/main/java/org/cedar/onestop/utils/ListUtils.java b/data-utils/src/main/java/org/cedar/onestop/utils/ListUtils.java new file mode 100644 index 000000000..6b26ad1d1 --- /dev/null +++ b/data-utils/src/main/java/org/cedar/onestop/utils/ListUtils.java @@ -0,0 +1,48 @@ +package org.cedar.onestop.utils; + +import java.util.ArrayList; +import java.util.List; + +public class ListUtils { + + public static List addOrInit(List list, T item) { + var result = new ArrayList(); + if (list != null && !list.isEmpty()) { + result.addAll(list); + } + if (item != null) { + result.add(item); + } + return result; + } + + /** + * + * @param list list to truncate + * @param maxListSize list size limit + * @param mostRecentAdditions if true, returned list reflects end of original list as opposed to start + * @param list object type + * @return truncated list of T objects + * 
@throws IllegalArgumentException if maxListSize is less than or equal to 0 + */ + public static List truncateList(List list, int maxListSize, boolean mostRecentAdditions) { + if (maxListSize <= 0) { + throw new IllegalArgumentException("Attempted to make a list of size [ " + maxListSize + " ]. " + + "Expected a size limit greater than 0."); + } + + var result = new ArrayList(); + if (list != null && !list.isEmpty()) { + var size = list.size(); + if(size <= maxListSize) { + result.addAll(list); + } + else { + var fromIndex = mostRecentAdditions ? size - maxListSize : 0; + var toIndex = mostRecentAdditions ? size : maxListSize; + result.addAll(list.subList(fromIndex, toIndex)); + } + } + return result; + } +} diff --git a/data-utils/src/main/java/org/cedar/onestop/utils/MapUtils.java b/data-utils/src/main/java/org/cedar/onestop/utils/MapUtils.java new file mode 100644 index 000000000..fe4525182 --- /dev/null +++ b/data-utils/src/main/java/org/cedar/onestop/utils/MapUtils.java @@ -0,0 +1,155 @@ +package org.cedar.onestop.utils; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import java.io.IOException; +import java.util.*; +import java.util.stream.Collectors; + +public class MapUtils { + +// public static Map parseJsonMap(String json) throws IOException { +// if (json == null || json == "") { +// return new LinkedHashMap(); +// } +// else { +// return new ObjectMapper().readValue(json, Map.class); +// } +// } + + /** + * Returns a merged Map of the original and toAdd Maps. Deep merges of nested Maps and Lists are performed and + * explicit duplicates (exact matches for all fields) are avoided. + * @param original Base Map to which elements will be merged from toAdd + * @param toAdd Map of elements to add to the original Map + * @return An updated original Map where new elements from toAdd have been merged. Returns empty Map if + * original and toAdd are empty or null. + */ + public static Map mergeMaps(Map original, Map toAdd) { + Map mergedMap = original == null ? new LinkedHashMap<>() : new LinkedHashMap<>(original); + if (original == null && toAdd == null) { + return Collections.emptyMap(); + } + if (original == null || original.size() == 0) { + return toAdd; + } + if (toAdd == null || toAdd.size() == 0) { + return original; + } + + toAdd.forEach((k, v) -> { + var originalValue = mergedMap.get(k); + if (v instanceof Map && originalValue instanceof Map) { + mergedMap.put(k, mergeMaps((Map) originalValue, (Map) v)); + } + else if (v instanceof List && originalValue instanceof List) { + var mergedList = new HashSet((List) originalValue); + mergedList.addAll((List) v); + mergedMap.put(k, new ArrayList(mergedList)); + } + else { + /* This overwrites simple values but also mismatched object types. Accepting that "risk" here since + useful errors are generated downstream for objects being cast to avro pojos but also because unknown JSON is + allowed to pass through later parsing/analysis steps untouched (either type change could be erroneous but + there's no way to know which) */ + mergedMap.put(k, v); + } + }); + + return mergedMap; + } + + /** + * Returns a new Map of the original with elements in toRemove discarded. Elements in toRemove must match those in + * original exactly, or they will not be removed. Handles nested Maps and Lists. + * @param original Base Map from which elements in toRemove will be removed + * @param toRemove Map of elements to remove from the original Map + * @return An updated original Map where matching elements from toRemove have been discarded. 
Returns empty Map if + * original is empty or null. + */ + public static Map removeFromMap(Map original, Map toRemove) { + Map mergedMap = original == null ? new LinkedHashMap<>() : new LinkedHashMap<>(original); + if (original == null && toRemove == null) { + return Collections.emptyMap(); + } + if (original == null || original.size() == 0) { + return Collections.emptyMap(); + } + if (toRemove == null || toRemove.size() == 0) { + return original; + } + + toRemove.forEach((k, v) -> { + var originalValue = mergedMap.get(k); + if (v instanceof Map && originalValue instanceof Map) { + mergedMap.put(k, removeFromMap((Map) originalValue, (Map) v)); + } + else if (v instanceof List && originalValue instanceof List) { + var mergedList = new HashSet<>((List) originalValue); + mergedList.removeAll((List) v); + mergedMap.put(k, mergedList); + } + else if ((v == null && originalValue == null) || v.equals(originalValue)) { + mergedMap.remove(k); + } + }); + + return mergedMap; + } + + /** + * Turns a nested map into a flat map with nested keys appended together with the delimiter + * @param parentKey Prefix that all flattened keys start with. Null, empty, or whitespace-only value results in no prefix + * @param delimiter String to delimit between each nested key. Defaults to "." if null or empty + * @param originalMap Nested-key map to be flattened + * @return Single-level map with flattened keys + */ + public static Map consolidateNestedKeysInMap(String parentKey, String delimiter, Map originalMap) { + var parent = (parentKey == null || parentKey.isBlank()) ? new String() : parentKey; + var delimiterString = (delimiter == null || delimiter.isEmpty()) ? "." : delimiter; + var newMap = new HashMap(); + + if(originalMap != null && !originalMap.isEmpty()) { + originalMap.forEach((k, v) -> { + String newKey = parent.isEmpty() ? k : parent + delimiterString + k; + if(v instanceof Map) { + newMap.putAll(consolidateNestedKeysInMap(newKey, delimiterString, (Map) v)); + } + else { + newMap.put(newKey, v); + } + }); + } + return newMap; + } + + /** + * Removes the given trimString from any keys in originalMap that match. For example a trim string 'abc.' would turn + * key 'abc.123' into key '123'. + * @param trimString Case insensitive prefix to remove from keys in originalMap + * @param originalMap + * @return New map with modified keys + */ + public static Map trimMapKeys(String trimString, Map originalMap) { + Map trimmedKeysMap = new LinkedHashMap<>(); + originalMap.forEach((k, v) -> { + String trimmedKey = k.toLowerCase().startsWith(trimString.toLowerCase()) ? 
k.substring(trimString.length()) : k;
+      trimmedKeysMap.put(trimmedKey, v);
+    });
+    return trimmedKeysMap;
+  }
+
+  /**
+   * Returns a map with all keys not contained in the given collection removed
+   * @param keysToKeep A collection of the keys to preserve in the filtered output; all others will be removed
+   * @param originalMap Map to filter
+   * @return The filtered map
+   */
+  public static Map filterMapKeys(Collection keysToKeep, Map originalMap) {
+    if (keysToKeep == null || keysToKeep.size() == 0) {
+      return new HashMap<>();
+    }
+    return originalMap.entrySet().stream()
+        .filter(e -> keysToKeep.contains(e.getKey()))
+        .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+  }
+}
diff --git a/elastic-common/build.gradle b/elastic-common/build.gradle
index 80c8ca9ee..ef207391c 100644
--- a/elastic-common/build.gradle
+++ b/elastic-common/build.gradle
@@ -6,6 +6,8 @@ dependencies {
   compileOnly("org.slf4j:slf4j-api:1.7.25")
 
+  implementation(project(':data-utils'))
+
   implementation("org.elasticsearch.client:elasticsearch-rest-client:${Versions.ELASTIC}")
   implementation("org.elasticsearch.client:elasticsearch-rest-high-level-client:${Versions.ELASTIC}")
   implementation("com.fasterxml.jackson.core:jackson-databind:2.10.0")
diff --git a/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java b/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java
index 2910d2502..ac6951cb0 100644
--- a/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java
+++ b/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java
@@ -127,6 +127,7 @@ public String jsonMapping(String alias) {
   public Map indexedProperties(String alias) {
     var parsed = (Map) parsedMapping(alias);
     var mappings = (Map) parsed.getOrDefault("mappings", Collections.emptyMap());
+
     return (Map) mappings.getOrDefault("properties", Collections.emptyMap());
   }
 
diff --git a/kafka-common/src/main/java/org/cedar/onestop/kafka/common/util/DataUtils.java b/kafka-common/src/main/java/org/cedar/onestop/kafka/common/util/DataUtils.java
index c77813cdc..8150b5739 100644
--- a/kafka-common/src/main/java/org/cedar/onestop/kafka/common/util/DataUtils.java
+++ b/kafka-common/src/main/java/org/cedar/onestop/kafka/common/util/DataUtils.java
@@ -200,6 +200,8 @@ public static Map filterMapKeys(Collection keysToKeep, M
         .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
   }
 
+  ////////// DELETE ^^^^^^ //////////////////
+
   /**
    * @param builderType type of schema builder either ParsedRecord or AggregatedInput, otherwise error out
    * @param fieldData parsed or input metadata values
diff --git a/settings.gradle.kts b/settings.gradle.kts
index 02047c692..1e852b37f 100644
--- a/settings.gradle.kts
+++ b/settings.gradle.kts
@@ -2,6 +2,7 @@ rootProject.name = "onestop"
 
 include(
   "client",
+  "data-utils",
   "e2e-tests",
   "elastic-common",
   "geoportal-search",

From cb4eb38754d350a28e35f7e757ec30c8267fb93b Mon Sep 17 00:00:00 2001
From: Zeb
Date: Wed, 27 May 2020 11:39:54 -0600
Subject: [PATCH 09/29] Filter A&E output against the nested ES mapping,
 skipping fields we deliberately don't index

The functions that clean up the object based on the nested ES mapping, and
on the fields we deliberately aren't indexing, are mostly sorted out. Lots
of code cleanup and tests still left.
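In outline, the recursive diff against the mapping works like the sketch
below. This is a self-contained illustration, not the final code: the real
logic lives in TransformationUtils.identifyUnmappedFields, and the generics,
null guard, and empty-submap handling here are simplifications of it.

    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.stream.Collectors;

    class MappingDiffSketch {
      // Collects the entries of `doc` that have no counterpart in the ES mapping.
      // Nested maps recurse through the mapping's "properties" layer; lists are
      // diffed element by element, dropping elements that come back empty.
      @SuppressWarnings("unchecked")
      static Map<String, Object> unmappedFields(Map<String, Object> doc, Map<String, Object> mapping) {
        var result = new LinkedHashMap<String, Object>();
        if (mapping == null) {          // nothing is mapped at this level, so everything is unmapped
          result.putAll(doc);
          return result;
        }
        doc.forEach((k, v) -> {
          if (!mapping.containsKey(k)) {
            result.put(k, v);           // field absent from the mapping entirely
          } else {
            var nested = (Map<String, Object>) ((Map<String, Object>) mapping.get(k)).get("properties");
            if (v instanceof Map) {
              var sub = unmappedFields((Map<String, Object>) v, nested);
              if (!sub.isEmpty()) result.put(k, sub);
            } else if (v instanceof List) {
              var subList = ((List<?>) v).stream()
                  .map(item -> unmappedFields((Map<String, Object>) item, nested))
                  .filter(item -> !item.isEmpty())
                  .collect(Collectors.toList());
              if (!subList.isEmpty()) result.put(k, subList);
            }
          }
        });
        return result;
      }
    }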
--- .../mappings/analysis_error_granuleIndex.json | 3 + .../indexer/util/TransformationUtils.java | 53 ++++++++++++-- .../util/TransformationUtilsSpec.groovy | 72 +++++++++++++++++-- 3 files changed, 117 insertions(+), 11 deletions(-) diff --git a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json index 9a86b537c..591132b3a 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json @@ -46,6 +46,9 @@ } } }, + "fileIdentifierExists": { + "type": "boolean" + }, "fileIdentifierString": { "type": "text", "fields": { diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index dfcb0b03d..3262a8d26 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -18,6 +18,7 @@ import static org.cedar.schemas.avro.psi.ValidDescriptor.UNDEFINED; import static org.cedar.schemas.avro.psi.ValidDescriptor.VALID; +import org.cedar.onestop.kafka.common.util.DataUtils; // TODO import org.apache.kafka.streams.StreamsBuilder; @@ -73,7 +74,29 @@ public static Map unfilteredAEMessage(ParsedRecord record) { return analysisMap; } - public static Map stuffToRemove(Map analysisMap, Map mapping) { + public static Map pruneKnownUnmappedFields(Map analysisMap, Map unmappedFields) { + + var result = new LinkedHashMap(); + analysisMap.forEach((k, v) -> { + if (!unmappedFields.containsKey(k)) { + result.put(k, v); + } else { + Map nestedProperties = (Map)((Map)unmappedFields.get(k)); // TODO almost identical to stuff to remove... but reversed... and no ".properties" layer... + + if (v instanceof Map) { + result.put(k, pruneKnownUnmappedFields((Map) v, nestedProperties)); + } else if (v instanceof List) { + var list = ((List) v).stream().map(item -> pruneKnownUnmappedFields((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) + .collect(Collectors.toList()); + System.out.println("ZEB - list: "+list); + result.put(k, list); + } + } + }); + return result; + } + + public static Map identifyUnmappedFields(Map analysisMap, Map mapping) { var result = new LinkedHashMap(); // analysisMap.entrySet().stream().forEach(e -> { // if( !mapping.containsKey(e.getKey())) { @@ -82,15 +105,31 @@ public static Map stuffToRemove(Map analysisMap, // if (e.getValue() instanceof Map){ // System.out.println("ZEB: the value is a map!"); // // System.out.println("mapping: "+mapping.get(e.getKey()).get("properties")); - // // System.out.println("--> "+stuffToRemove((Map)e.getValue(), (Map)mapping.get(e.getKey()).get("properties"))); - // result.put(e.getKey(), stuffToRemove((Map)e.getValue(), (Map)((Map)mapping.get(e.getKey())).get("properties"))); // TODO brute force assumes mapping is an object map string:object here too + // // System.out.println("--> "+identifyUnmappedFields((Map)e.getValue(), (Map)mapping.get(e.getKey()).get("properties"))); + // result.put(e.getKey(), identifyUnmappedFields((Map)e.getValue(), (Map)((Map)mapping.get(e.getKey())).get("properties"))); // TODO brute force assumes mapping is an object map string:object here too // } else if(e.getValue() instanceof Collection){ // // TODO!!!! 
- // // result.put(e.getKey(), ((Collection)e.getValue()).filter(item -> !stuffToRemove((Map)item, (Map)((Map)mapping.get(e.getKey())).get("properties"))).isEmpty()); + // // result.put(e.getKey(), ((Collection)e.getValue()).filter(item -> !identifyUnmappedFields((Map)item, (Map)((Map)mapping.get(e.getKey())).get("properties"))).isEmpty()); // } // } // }); // return result; + // + // const knownUnmappedTemporalFields = new HashMap(); + // knownUnmappedTemporalFields.put("beginYear", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("beginDayOfYear", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("beginDayOfMonth", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("beginMonth", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("endYear", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("endDayOfYear", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("endDayOfMonth", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("endMonth", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("instantYear", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("instantDayOfYear", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("instantDayOfMonth", "mapped to search index instead"); + // knownUnmappedTemporalFields.put("instantMonth", "mapped to search index instead"); + // const knownUnmappedFields = new HashMap(); + // knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); analysisMap.forEach((k, v) -> { if (!mapping.containsKey(k)) { @@ -98,10 +137,12 @@ public static Map stuffToRemove(Map analysisMap, } else { Map nestedProperties = (Map)((Map)mapping.get(k)).get("properties"); // TODO assumes mapping is also a Map! 
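+        // An ES mapping entry is either a leaf like {"type": "keyword"} or a
+        // nested object whose children live under a "properties" key, hence the
+        // extra .get("properties") hop before recursing.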
+ // Map knownUnmapped = (Map)knownUnmappedFields.get(k); + if (v instanceof Map) { - result.put(k, stuffToRemove((Map) v, nestedProperties)); + result.put(k, identifyUnmappedFields((Map) v, nestedProperties)); } else if (v instanceof List) { - var list = ((List) v).stream().map(item -> stuffToRemove((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) + var list = ((List) v).stream().map(item -> identifyUnmappedFields((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) .collect(Collectors.toList()); System.out.println("ZEB - list: "+list); result.put(k, list); diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index f9e6e5c6c..59a3f0608 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -20,6 +20,8 @@ import groovy.json.JsonOutput import static org.cedar.schemas.avro.util.TemporalTestData.getSituations +import org.cedar.onestop.kafka.common.util.DataUtils; + @Unroll class TransformationUtilsSpec extends Specification { @@ -96,7 +98,7 @@ class TransformationUtilsSpec extends Specification { def result = TransformationUtils.reformatMessageForAnalysisAndErrors(TestUtils.inputGranuleRecord, granuleAnalysisErrorFields) - def asdf = TransformationUtils.stuffToRemove(TransformationUtils.unfilteredAEMessage(TestUtils.inputGranuleRecord), TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + def asdf = TransformationUtils.identifyUnmappedFields(TransformationUtils.unfilteredAEMessage(TestUtils.inputGranuleRecord), TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) println("ZEB") println(result) @@ -108,6 +110,33 @@ class TransformationUtilsSpec extends Specification { def "can i construct a record"() { when: + println("YO ZEB") + // println( + // DataUtils.wipMapKeys('type', DataUtils.wipMapKeys('properties', DataUtils.consolidateNestedKeysInMap(null, ".", TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)))) + // ) + + + def esmapping = DataUtils.wipMapKeys('type', DataUtils.wipMapKeys('properties', DataUtils.consolidateNestedKeysInMap(null, ".", TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)))) + println("esmapping: "+JsonOutput.toJson(esmapping)) + + + + def knownUnmappedTemporalFields = new HashMap(); + knownUnmappedTemporalFields.put("beginYear", new HashMap()); + knownUnmappedTemporalFields.put("beginDayOfYear", new HashMap()); + knownUnmappedTemporalFields.put("beginDayOfMonth", new HashMap()); + knownUnmappedTemporalFields.put("beginMonth", new HashMap()); + knownUnmappedTemporalFields.put("endYear", new HashMap()); + knownUnmappedTemporalFields.put("endDayOfYear", new HashMap()); + knownUnmappedTemporalFields.put("endDayOfMonth", new HashMap()); + knownUnmappedTemporalFields.put("endMonth", new HashMap()); + knownUnmappedTemporalFields.put("instantYear", new HashMap()); + knownUnmappedTemporalFields.put("instantDayOfYear", new HashMap()); + knownUnmappedTemporalFields.put("instantDayOfMonth", new HashMap()); + knownUnmappedTemporalFields.put("instantMonth", new HashMap()); + def knownUnmappedFields = new HashMap(); + knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); + 
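+      // Rationale: these temporal sub-fields are deliberately absent from the
+      // A&E mapping (they are mapped to the search index instead), so the prune
+      // step should drop them quietly rather than warn about them.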
ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) .setAnalysis( Analysis.newBuilder().setTemporalBounding( @@ -123,13 +152,46 @@ class TransformationUtilsSpec extends Specification { .setBeginDayOfMonth(1) .build() ).build()).build() - def asdf = TransformationUtils.stuffToRemove(TransformationUtils.unfilteredAEMessage(record), TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - println("ZEB") - println(JsonOutput.toJson(asdf)) + def parsed = TransformationUtils.unfilteredAEMessage(record) + def asdf = TransformationUtils.identifyUnmappedFields(parsed, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + + println("ZEB from "+JsonOutput.toJson(parsed)) + println("ZEB minus "+JsonOutput.toJson(asdf)) + println("AND CLEANED? "+JsonOutput.toJson(DataUtils.removeFromMap((parsed), asdf))) + def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, knownUnmappedFields) + println("pruned unampped? "+JsonOutput.toJson(pruned)) + def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + println("creates minus: "+JsonOutput.toJson(minus)) + println("which results in indexing: "+ JsonOutput.toJson(DataUtils.removeFromMap(pruned, minus))) + asdf.get("temporalBounding").remove("instantYear") + + asdf.get("temporalBounding").remove("beginYear") + asdf.get("temporalBounding").remove("beginDayOfYear") + asdf.get("temporalBounding").remove("beginDayOfMonth") + asdf.get("temporalBounding").remove("beginMonth") + asdf.get("temporalBounding").remove("endYear") + asdf.get("temporalBounding").remove("endDayOfYear") + asdf.get("temporalBounding").remove("endDayOfMonth") + asdf.get("temporalBounding").remove("endMonth") + asdf.get("temporalBounding").remove("instantYear") + asdf.get("temporalBounding").remove("instantDayOfYear") + asdf.get("temporalBounding").remove("instantDayOfMonth") + asdf.get("temporalBounding").remove("instantMonth") + println("cleaned up for dumb logging: "+JsonOutput.toJson(asdf)) + + def objkeys = DataUtils.consolidateNestedKeysInMap(null, ".", parsed) + println("objkeys: "+JsonOutput.toJson(objkeys)) + def junkToremove = DataUtils.filterMapKeys(esmapping.keySet(), objkeys) + println("remove me:" + JsonOutput.toJson(junkToremove)) + // def trimmed = DataUtils.filterMapKeys(junkToremove.keySet(), objkeys) + def trimmed = DataUtils.removeFromMap(objkeys, junkToremove) + println("fixed: "+JsonOutput.toJson(trimmed)) + then: - asdf.keySet().each({ assert granuleAnalysisErrorFields.contains(it) }) + // asdf.keySet().each({ assert granuleAnalysisErrorFields.contains(it) }) + trimmed == [ "foo" : "bar"] } //////////////////////////////// From be1dfe8965821902dea487ca3280a07c6c543804 Mon Sep 17 00:00:00 2001 From: Zeb Date: Wed, 27 May 2020 12:00:34 -0600 Subject: [PATCH 10/29] Begin cleaning up test... 
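The spec should end up pinning down the whole pipeline rather than printing
intermediate state. Roughly, in outline (method names as on this branch;
`parsed` and `knownUnmappedFields` stand in for the real fixtures):

    // 1. prune the fields we knowingly leave out of the A&E mapping,
    // 2. diff what remains against the mapping (that diff is worth a warning),
    // 3. strip the diff so the strict mapping doesn't reject the request.
    var aeMapping = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS);
    var pruned    = TransformationUtils.pruneKnownUnmappedFields(parsed, knownUnmappedFields);
    var unmapped  = TransformationUtils.identifyUnmappedFields(pruned, aeMapping);
    var indexed   = DataUtils.removeFromMap(pruned, unmapped);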
--- .../indexer/util/TransformationUtils.java | 15 ---- .../util/TransformationUtilsSpec.groovy | 90 +++++++++---------- 2 files changed, 40 insertions(+), 65 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index 3262a8d26..9cfb43edc 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -115,21 +115,6 @@ public static Map identifyUnmappedFields(Map ana // }); // return result; // - // const knownUnmappedTemporalFields = new HashMap(); - // knownUnmappedTemporalFields.put("beginYear", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("beginDayOfYear", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("beginDayOfMonth", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("beginMonth", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("endYear", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("endDayOfYear", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("endDayOfMonth", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("endMonth", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("instantYear", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("instantDayOfYear", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("instantDayOfMonth", "mapped to search index instead"); - // knownUnmappedTemporalFields.put("instantMonth", "mapped to search index instead"); - // const knownUnmappedFields = new HashMap(); - // knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); analysisMap.forEach((k, v) -> { if (!mapping.containsKey(k)) { diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index 59a3f0608..9b07c0d36 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -110,14 +110,19 @@ class TransformationUtilsSpec extends Specification { def "can i construct a record"() { when: - println("YO ZEB") + def parsed = [identification:null, titles:null, description:null, dataAccess:null, thumbnail:null, temporalBounding:[ + beginDescriptor:ValidDescriptor.VALID, beginPrecision:ChronoUnit.DAYS.toString(), beginIndexable:true, beginZoneSpecified:null, beginUtcDateTimeString:2000-02-01, beginYear:2000, beginDayOfYear:32, beginDayOfMonth:1, beginMonth:2, + endDescriptor:null, endPrecision:null, endIndexable:null, endZoneSpecified:null, endUtcDateTimeString:null, endYear:null, endDayOfYear:null, endDayOfMonth:null, endMonth:null, + instantDescriptor:null, instantPrecision:null, instantIndexable:null, instantZoneSpecified:null, instantUtcDateTimeString:null, instantYear:null, instantDayOfYear:null, instantDayOfMonth:null, instantMonth:null, + rangeDescriptor:null], + spatialBounding:null, internalParentIdentifier:null, errors:[[nonsense:"horrible", source:"valid field"]], garbage:"nuke meeee"] // println( // DataUtils.wipMapKeys('type', DataUtils.wipMapKeys('properties', DataUtils.consolidateNestedKeysInMap(null, ".", 
TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)))) // ) - def esmapping = DataUtils.wipMapKeys('type', DataUtils.wipMapKeys('properties', DataUtils.consolidateNestedKeysInMap(null, ".", TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)))) - println("esmapping: "+JsonOutput.toJson(esmapping)) + // def esmapping = DataUtils.wipMapKeys('type', DataUtils.wipMapKeys('properties', DataUtils.consolidateNestedKeysInMap(null, ".", TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)))) + // println("esmapping: "+JsonOutput.toJson(esmapping)) @@ -137,61 +142,46 @@ class TransformationUtilsSpec extends Specification { def knownUnmappedFields = new HashMap(); knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); - ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) - .setAnalysis( - Analysis.newBuilder().setTemporalBounding( - TemporalBoundingAnalysis.newBuilder() - .setBeginDescriptor(ValidDescriptor.VALID) - .setBeginIndexable(true) - .setBeginPrecision(ChronoUnit.DAYS.toString()) - .setBeginZoneSpecified(null) - .setBeginUtcDateTimeString("2000-02-01") - .setBeginYear(2000) - .setBeginMonth(2) - .setBeginDayOfYear(32) - .setBeginDayOfMonth(1) - .build() - ).build()).build() - - def parsed = TransformationUtils.unfilteredAEMessage(record) - def asdf = TransformationUtils.identifyUnmappedFields(parsed, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - - println("ZEB from "+JsonOutput.toJson(parsed)) - println("ZEB minus "+JsonOutput.toJson(asdf)) - println("AND CLEANED? "+JsonOutput.toJson(DataUtils.removeFromMap((parsed), asdf))) + // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) + // .setAnalysis( + // Analysis.newBuilder().setTemporalBounding( + // TemporalBoundingAnalysis.newBuilder() + // .setBeginDescriptor(ValidDescriptor.VALID) + // .setBeginIndexable(true) + // .setBeginPrecision(ChronoUnit.DAYS.toString()) + // .setBeginZoneSpecified(null) + // .setBeginUtcDateTimeString("2000-02-01") + // .setBeginYear(2000) + // .setBeginMonth(2) + // .setBeginDayOfYear(32) + // .setBeginDayOfMonth(1) + // .build() + // ).build()).build() + + // def parsed = TransformationUtils.unfilteredAEMessage(record) + // def asdf = TransformationUtils.identifyUnmappedFields(parsed, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + + println("parsed "+JsonOutput.toJson(parsed)) + // println("???"+parsed) + // println("in groov"+ [identification:null, titles:null, description:null, dataAccess:null, thumbnail:null, temporalBounding:[ + // beginDescriptor:ValidDescriptor.VALID, beginPrecision:ChronoUnit.DAYS.toString(), beginIndexable:true, beginZoneSpecified:null, beginUtcDateTimeString:2000-02-01, beginYear:2000, beginDayOfYear:32, beginDayOfMonth:1, beginMonth:2, + // endDescriptor:null, endPrecision:null, endIndexable:null, endZoneSpecified:null, endUtcDateTimeString:null, endYear:null, endDayOfYear:null, endDayOfMonth:null, endMonth:null, + // instantDescriptor:null, instantPrecision:null, instantIndexable:null, instantZoneSpecified:null, instantUtcDateTimeString:null, instantYear:null, instantDayOfYear:null, instantDayOfMonth:null, instantMonth:null, + // rangeDescriptor:null], + // spatialBounding:null, internalParentIdentifier:null, errors:[[nonsense:"horrible", source:"valid field"]], garbage:"nuke meeee"] +// ) 
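+      // Pipeline under test: prune the fields we knowingly leave unmapped, diff
+      // the remainder against the ES mapping, then strip that diff before indexing.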
def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, knownUnmappedFields) println("pruned unampped? "+JsonOutput.toJson(pruned)) def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) println("creates minus: "+JsonOutput.toJson(minus)) println("which results in indexing: "+ JsonOutput.toJson(DataUtils.removeFromMap(pruned, minus))) - asdf.get("temporalBounding").remove("instantYear") - - asdf.get("temporalBounding").remove("beginYear") - asdf.get("temporalBounding").remove("beginDayOfYear") - asdf.get("temporalBounding").remove("beginDayOfMonth") - asdf.get("temporalBounding").remove("beginMonth") - asdf.get("temporalBounding").remove("endYear") - asdf.get("temporalBounding").remove("endDayOfYear") - asdf.get("temporalBounding").remove("endDayOfMonth") - asdf.get("temporalBounding").remove("endMonth") - asdf.get("temporalBounding").remove("instantYear") - asdf.get("temporalBounding").remove("instantDayOfYear") - asdf.get("temporalBounding").remove("instantDayOfMonth") - asdf.get("temporalBounding").remove("instantMonth") - println("cleaned up for dumb logging: "+JsonOutput.toJson(asdf)) - - def objkeys = DataUtils.consolidateNestedKeysInMap(null, ".", parsed) - println("objkeys: "+JsonOutput.toJson(objkeys)) - def junkToremove = DataUtils.filterMapKeys(esmapping.keySet(), objkeys) - println("remove me:" + JsonOutput.toJson(junkToremove)) - // def trimmed = DataUtils.filterMapKeys(junkToremove.keySet(), objkeys) - def trimmed = DataUtils.removeFromMap(objkeys, junkToremove) - println("fixed: "+JsonOutput.toJson(trimmed)) - then: // asdf.keySet().each({ assert granuleAnalysisErrorFields.contains(it) }) - trimmed == [ "foo" : "bar"] + // trimmed == [ "foo" : "bar"] + minus == [temporalBounding:[ + instantIndexable:null], + errors:[[nonsense:"horrible"]], garbage:"nuke meeee"] } //////////////////////////////// From acdb286ff244468a0d021bb23fa1ce86d55b0cda Mon Sep 17 00:00:00 2001 From: arianna Date: Wed, 27 May 2020 15:33:22 -0600 Subject: [PATCH 11/29] Revert "wip making new data-utils module" This reverts commit 21d3448a40ea3aef54a3f049faf49f8c350f3ce0. 
--- data-utils/build.gradle | 14 -- .../org/cedar/onestop/utils/ListUtils.java | 48 ------ .../org/cedar/onestop/utils/MapUtils.java | 155 ------------------ elastic-common/build.gradle | 2 - .../elastic/common/ElasticsearchConfig.java | 1 - .../onestop/kafka/common/util/DataUtils.java | 2 - settings.gradle.kts | 1 - 7 files changed, 223 deletions(-) delete mode 100644 data-utils/build.gradle delete mode 100644 data-utils/src/main/java/org/cedar/onestop/utils/ListUtils.java delete mode 100644 data-utils/src/main/java/org/cedar/onestop/utils/MapUtils.java diff --git a/data-utils/build.gradle b/data-utils/build.gradle deleted file mode 100644 index 96c856cb7..000000000 --- a/data-utils/build.gradle +++ /dev/null @@ -1,14 +0,0 @@ -sourceCompatibility = 11 -targetCompatibility = 11 - -dependencies { - def Versions = project.Versions - - compileOnly("org.slf4j:slf4j-api:1.7.25") - - implementation("org.yaml:snakeyaml:${Versions.SNAKE_YAML}") -} - -jar { - archiveBaseName.set("${rootProject.name}-${project.name}") -} \ No newline at end of file diff --git a/data-utils/src/main/java/org/cedar/onestop/utils/ListUtils.java b/data-utils/src/main/java/org/cedar/onestop/utils/ListUtils.java deleted file mode 100644 index 6b26ad1d1..000000000 --- a/data-utils/src/main/java/org/cedar/onestop/utils/ListUtils.java +++ /dev/null @@ -1,48 +0,0 @@ -package org.cedar.onestop.utils; - -import java.util.ArrayList; -import java.util.List; - -public class ListUtils { - - public static List addOrInit(List list, T item) { - var result = new ArrayList(); - if (list != null && !list.isEmpty()) { - result.addAll(list); - } - if (item != null) { - result.add(item); - } - return result; - } - - /** - * - * @param list list to truncate - * @param maxListSize list size limit - * @param mostRecentAdditions if true, returned list reflects end of original list as opposed to start - * @param list object type - * @return truncated list of T objects - * @throws IllegalArgumentException if maxListSize is less than or equal to 0 - */ - public static List truncateList(List list, int maxListSize, boolean mostRecentAdditions) { - if (maxListSize <= 0) { - throw new IllegalArgumentException("Attempted to make a list of size [ " + maxListSize + " ]. " + - "Expected a size limit greater than 0."); - } - - var result = new ArrayList(); - if (list != null && !list.isEmpty()) { - var size = list.size(); - if(size <= maxListSize) { - result.addAll(list); - } - else { - var fromIndex = mostRecentAdditions ? size - maxListSize : 0; - var toIndex = mostRecentAdditions ? size : maxListSize; - result.addAll(list.subList(fromIndex, toIndex)); - } - } - return result; - } -} diff --git a/data-utils/src/main/java/org/cedar/onestop/utils/MapUtils.java b/data-utils/src/main/java/org/cedar/onestop/utils/MapUtils.java deleted file mode 100644 index fe4525182..000000000 --- a/data-utils/src/main/java/org/cedar/onestop/utils/MapUtils.java +++ /dev/null @@ -1,155 +0,0 @@ -package org.cedar.onestop.utils; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import java.io.IOException; -import java.util.*; -import java.util.stream.Collectors; - -public class MapUtils { - -// public static Map parseJsonMap(String json) throws IOException { -// if (json == null || json == "") { -// return new LinkedHashMap(); -// } -// else { -// return new ObjectMapper().readValue(json, Map.class); -// } -// } - - /** - * Returns a merged Map of the original and toAdd Maps. 
Deep merges of nested Maps and Lists are performed and - * explicit duplicates (exact matches for all fields) are avoided. - * @param original Base Map to which elements will be merged from toAdd - * @param toAdd Map of elements to add to the original Map - * @return An updated original Map where new elements from toAdd have been merged. Returns empty Map if - * original and toAdd are empty or null. - */ - public static Map mergeMaps(Map original, Map toAdd) { - Map mergedMap = original == null ? new LinkedHashMap<>() : new LinkedHashMap<>(original); - if (original == null && toAdd == null) { - return Collections.emptyMap(); - } - if (original == null || original.size() == 0) { - return toAdd; - } - if (toAdd == null || toAdd.size() == 0) { - return original; - } - - toAdd.forEach((k, v) -> { - var originalValue = mergedMap.get(k); - if (v instanceof Map && originalValue instanceof Map) { - mergedMap.put(k, mergeMaps((Map) originalValue, (Map) v)); - } - else if (v instanceof List && originalValue instanceof List) { - var mergedList = new HashSet((List) originalValue); - mergedList.addAll((List) v); - mergedMap.put(k, new ArrayList(mergedList)); - } - else { - /* This overwrites simple values but also mismatched object types. Accepting that "risk" here since - useful errors are generated downstream for objects being cast to avro pojos but also because unknown JSON is - allowed to pass through later parsing/analysis steps untouched (either type change could be erroneous but - there's no way to know which) */ - mergedMap.put(k, v); - } - }); - - return mergedMap; - } - - /** - * Returns a new Map of the original with elements in toRemove discarded. Elements in toRemove must match those in - * original exactly, or they will not be removed. Handles nested Maps and Lists. - * @param original Base Map from which elements in toRemove will be removed - * @param toRemove Map of elements to remove from the original Map - * @return An updated original Map where matching elements from toRemove have been discarded. Returns empty Map if - * original is empty or null. - */ - public static Map removeFromMap(Map original, Map toRemove) { - Map mergedMap = original == null ? new LinkedHashMap<>() : new LinkedHashMap<>(original); - if (original == null && toRemove == null) { - return Collections.emptyMap(); - } - if (original == null || original.size() == 0) { - return Collections.emptyMap(); - } - if (toRemove == null || toRemove.size() == 0) { - return original; - } - - toRemove.forEach((k, v) -> { - var originalValue = mergedMap.get(k); - if (v instanceof Map && originalValue instanceof Map) { - mergedMap.put(k, removeFromMap((Map) originalValue, (Map) v)); - } - else if (v instanceof List && originalValue instanceof List) { - var mergedList = new HashSet<>((List) originalValue); - mergedList.removeAll((List) v); - mergedMap.put(k, mergedList); - } - else if ((v == null && originalValue == null) || v.equals(originalValue)) { - mergedMap.remove(k); - } - }); - - return mergedMap; - } - - /** - * Turns a nested map into a flat map with nested keys appended together with the delimiter - * @param parentKey Prefix that all flattened keys start with. Null, empty, or whitespace-only value results in no prefix - * @param delimiter String to delimit between each nested key. Defaults to "." 
if null or empty - * @param originalMap Nested-key map to be flattened - * @return Single-level map with flattened keys - */ - public static Map consolidateNestedKeysInMap(String parentKey, String delimiter, Map originalMap) { - var parent = (parentKey == null || parentKey.isBlank()) ? new String() : parentKey; - var delimiterString = (delimiter == null || delimiter.isEmpty()) ? "." : delimiter; - var newMap = new HashMap(); - - if(originalMap != null && !originalMap.isEmpty()) { - originalMap.forEach((k, v) -> { - String newKey = parent.isEmpty() ? k : parent + delimiterString + k; - if(v instanceof Map) { - newMap.putAll(consolidateNestedKeysInMap(newKey, delimiterString, (Map) v)); - } - else { - newMap.put(newKey, v); - } - }); - } - return newMap; - } - - /** - * Removes the given trimString from any keys in originalMap that match. For example a trim string 'abc.' would turn - * key 'abc.123' into key '123'. - * @param trimString Case insensitive prefix to remove from keys in originalMap - * @param originalMap - * @return New map with modified keys - */ - public static Map trimMapKeys(String trimString, Map originalMap) { - Map trimmedKeysMap = new LinkedHashMap<>(); - originalMap.forEach((k, v) -> { - String trimmedKey = k.toLowerCase().startsWith(trimString.toLowerCase()) ? k.substring(trimString.length()) : k; - trimmedKeysMap.put(trimmedKey, v); - }); - return trimmedKeysMap; - } - - /** - * Returns an map with all keys not contained in the given collection removed - * @param keysToKeep A collection of the keys to preserve in the filtered output; all others will be removed - * @return The filtered map - */ - public static Map filterMapKeys(Collection keysToKeep, Map originalMap) { - if (keysToKeep == null || keysToKeep.size() == 0) { - return new HashMap<>(); - } - return originalMap.entrySet().stream() - .filter(e -> keysToKeep.contains(e.getKey())) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - } -} diff --git a/elastic-common/build.gradle b/elastic-common/build.gradle index ef207391c..80c8ca9ee 100644 --- a/elastic-common/build.gradle +++ b/elastic-common/build.gradle @@ -6,8 +6,6 @@ dependencies { compileOnly("org.slf4j:slf4j-api:1.7.25") - implementation(project(':data-utils')) - implementation("org.elasticsearch.client:elasticsearch-rest-client:${Versions.ELASTIC}") implementation("org.elasticsearch.client:elasticsearch-rest-high-level-client:${Versions.ELASTIC}") implementation("com.fasterxml.jackson.core:jackson-databind:2.10.0") diff --git a/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java b/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java index ac6951cb0..2910d2502 100644 --- a/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java +++ b/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java @@ -127,7 +127,6 @@ public String jsonMapping(String alias) { public Map indexedProperties(String alias) { var parsed = (Map) parsedMapping(alias); var mappings = (Map) parsed.getOrDefault("mappings", Collections.emptyMap()); - return (Map) mappings.getOrDefault("properties", Collections.emptyMap()); } diff --git a/kafka-common/src/main/java/org/cedar/onestop/kafka/common/util/DataUtils.java b/kafka-common/src/main/java/org/cedar/onestop/kafka/common/util/DataUtils.java index 8150b5739..c77813cdc 100644 --- a/kafka-common/src/main/java/org/cedar/onestop/kafka/common/util/DataUtils.java +++ 
b/kafka-common/src/main/java/org/cedar/onestop/kafka/common/util/DataUtils.java @@ -200,8 +200,6 @@ public static Map filterMapKeys(Collection keysToKeep, M .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } - ////////// DELETE ^^^^^^ ////////////////// - /** * @param builderType type of schema builder either ParsedRecord or AggregatedInput, otherwise error out * @param fieldData parsed or input metadata values diff --git a/settings.gradle.kts b/settings.gradle.kts index 1e852b37f..02047c692 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -2,7 +2,6 @@ rootProject.name = "onestop" include( "client", - "data-utils", "e2e-tests", "elastic-common", "geoportal-search", From d525128a9b7d22e18eb04b719e327a9ea04ab0b6 Mon Sep 17 00:00:00 2001 From: Zeb Date: Wed, 27 May 2020 15:58:58 -0600 Subject: [PATCH 12/29] Fixing up tests. --- .../analysis_error_collectionIndex.json | 6 + .../mappings/analysis_error_granuleIndex.json | 6 + .../util/TransformationUtilsSpec.groovy | 201 ++++++++++++++---- 3 files changed, 175 insertions(+), 38 deletions(-) diff --git a/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json index ba5e2d670..3bbb551f4 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_collectionIndex.json @@ -108,6 +108,9 @@ "endUtcDateTimeString": { "type": "keyword" }, + "endZoneSpecified": { + "type": "keyword" + }, "instantDescriptor": { "type": "keyword" }, @@ -120,6 +123,9 @@ "instantUtcDateTimeString": { "type": "keyword" }, + "instantZoneSpecified": { + "type": "keyword" + }, "rangeDescriptor": { "type": "keyword" } diff --git a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json index 591132b3a..5cd6136c2 100644 --- a/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json +++ b/elastic-common/src/main/resources/mappings/analysis_error_granuleIndex.json @@ -114,6 +114,9 @@ "endUtcDateTimeString": { "type": "keyword" }, + "endZoneSpecified": { + "type": "keyword" + }, "instantDescriptor": { "type": "keyword" }, @@ -126,6 +129,9 @@ "instantUtcDateTimeString": { "type": "keyword" }, + "instantZoneSpecified": { + "type": "keyword" + }, "rangeDescriptor": { "type": "keyword" } diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index 9b07c0d36..e7f29ddf7 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -108,21 +108,55 @@ class TransformationUtilsSpec extends Specification { result.keySet().each({ assert granuleAnalysisErrorFields.contains(it) }) } - def "can i construct a record"() { + def "clean up nested map before indexing strictly mapped fields"() { when: - def parsed = [identification:null, titles:null, description:null, dataAccess:null, thumbnail:null, temporalBounding:[ - beginDescriptor:ValidDescriptor.VALID, beginPrecision:ChronoUnit.DAYS.toString(), beginIndexable:true, beginZoneSpecified:null, beginUtcDateTimeString:2000-02-01, beginYear:2000, beginDayOfYear:32, beginDayOfMonth:1, beginMonth:2, - endDescriptor:null, endPrecision:null, 
endIndexable:null, endZoneSpecified:null, endUtcDateTimeString:null, endYear:null, endDayOfYear:null, endDayOfMonth:null, endMonth:null, - instantDescriptor:null, instantPrecision:null, instantIndexable:null, instantZoneSpecified:null, instantUtcDateTimeString:null, instantYear:null, instantDayOfYear:null, instantDayOfMonth:null, instantMonth:null, - rangeDescriptor:null], - spatialBounding:null, internalParentIdentifier:null, errors:[[nonsense:"horrible", source:"valid field"]], garbage:"nuke meeee"] - // println( - // DataUtils.wipMapKeys('type', DataUtils.wipMapKeys('properties', DataUtils.consolidateNestedKeysInMap(null, ".", TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)))) - // ) - - - // def esmapping = DataUtils.wipMapKeys('type', DataUtils.wipMapKeys('properties', DataUtils.consolidateNestedKeysInMap(null, ".", TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)))) - // println("esmapping: "+JsonOutput.toJson(esmapping)) + def parsed = [ + identification: null, + titles: null, + description: null, + dataAccess: null, + thumbnail: null, + temporalBounding: [ + beginDescriptor: ValidDescriptor.VALID, + beginPrecision: ChronoUnit.DAYS.toString(), + beginIndexable: true, + beginZoneSpecified: null, + beginUtcDateTimeString: "2000-02-01", + beginYear: 2000, + beginDayOfYear: 32, + beginDayOfMonth: 1, + beginMonth: 2, + endDescriptor: null, + endPrecision: null, + endIndexable: null, + endZoneSpecified: null, + endUtcDateTimeString: null, + endYear: null, + endDayOfYear: null, + endDayOfMonth: null, + endMonth: null, + instantDescriptor: null, + instantPrecision: null, + instantIndexable: null, + instantZoneSpecified: null, + instantUtcDateTimeString: null, + instantYear: null, + instantDayOfYear: null, + instantDayOfMonth: null, + instantMonth: null, + rangeDescriptor: null, + fakeField: 123 + ], + spatialBounding: null, + internalParentIdentifier: null, + errors: [ + [ + nonsense: "horrible", + source: "valid field" + ] + ], + garbage:"nuke meeee" + ] @@ -158,30 +192,121 @@ class TransformationUtilsSpec extends Specification { // .build() // ).build()).build() - // def parsed = TransformationUtils.unfilteredAEMessage(record) - // def asdf = TransformationUtils.identifyUnmappedFields(parsed, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - - println("parsed "+JsonOutput.toJson(parsed)) - // println("???"+parsed) - // println("in groov"+ [identification:null, titles:null, description:null, dataAccess:null, thumbnail:null, temporalBounding:[ - // beginDescriptor:ValidDescriptor.VALID, beginPrecision:ChronoUnit.DAYS.toString(), beginIndexable:true, beginZoneSpecified:null, beginUtcDateTimeString:2000-02-01, beginYear:2000, beginDayOfYear:32, beginDayOfMonth:1, beginMonth:2, - // endDescriptor:null, endPrecision:null, endIndexable:null, endZoneSpecified:null, endUtcDateTimeString:null, endYear:null, endDayOfYear:null, endDayOfMonth:null, endMonth:null, - // instantDescriptor:null, instantPrecision:null, instantIndexable:null, instantZoneSpecified:null, instantUtcDateTimeString:null, instantYear:null, instantDayOfYear:null, instantDayOfMonth:null, instantMonth:null, - // rangeDescriptor:null], - // spatialBounding:null, internalParentIdentifier:null, errors:[[nonsense:"horrible", source:"valid field"]], garbage:"nuke meeee"] -// ) - def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, knownUnmappedFields) - println("pruned unampped? 
"+JsonOutput.toJson(pruned)) - def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - println("creates minus: "+JsonOutput.toJson(minus)) - println("which results in indexing: "+ JsonOutput.toJson(DataUtils.removeFromMap(pruned, minus))) - - then: - // asdf.keySet().each({ assert granuleAnalysisErrorFields.contains(it) }) - // trimmed == [ "foo" : "bar"] - minus == [temporalBounding:[ - instantIndexable:null], - errors:[[nonsense:"horrible"]], garbage:"nuke meeee"] + // def parsed = TransformationUtils.unfilteredAEMessage(record) + + println("parsed "+JsonOutput.toJson(parsed)) + def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, knownUnmappedFields) + println("pruned unampped? "+JsonOutput.toJson(pruned)) + def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + println("creates minus: "+JsonOutput.toJson(minus)) + def indexedRecord = DataUtils.removeFromMap(pruned, minus) + println("which results in indexing: "+ JsonOutput.toJson(indexedRecord)) + + then: + minus == [ + temporalBounding: [ + fakeField: 123 + ], + errors: [ + [ + nonsense: "horrible", + ] + ], + garbage:"nuke meeee" + ] + + // println("wtf"+JsonOutput.toJson(indexedRecord)) + // println("wtf"+JsonOutput.toJson([ + // identification: null, + // titles: null, + // description: null, + // dataAccess: null, + // thumbnail: null, + // temporalBounding: [ + // beginDescriptor: ValidDescriptor.VALID, + // beginPrecision: ChronoUnit.DAYS.toString(), + // beginIndexable: true, + // beginZoneSpecified: null, + // beginUtcDateTimeString: "2000-02-01", + // endDescriptor: null, + // endPrecision: null, + // endIndexable: null, + // endZoneSpecified: null, + // endUtcDateTimeString: null, + // instantDescriptor: null, + // instantPrecision: null, + // instantIndexable: null, + // instantZoneSpecified: null, + // instantUtcDateTimeString: null, + // rangeDescriptor: null + // ], + // spatialBounding: null, + // internalParentIdentifier: null, + // errors: [ + // [nonsense:"horrible", + // source: "valid field" + // ] + // ] + // ])) + // assert indexedRecord == [ + // identification: null, + // titles: null, + // description: null, + // dataAccess: null, + // thumbnail: null, + // temporalBounding: [ + // beginDescriptor: ValidDescriptor.VALID, + // beginPrecision: ChronoUnit.DAYS.toString(), + // beginIndexable: true, + // beginZoneSpecified: null, + // beginUtcDateTimeString: "2000-02-01", + // endDescriptor: null, + // endPrecision: null, + // endIndexable: null, + // endZoneSpecified: null, + // endUtcDateTimeString: null, + // instantDescriptor: null, + // instantPrecision: null, + // instantIndexable: null, + // instantZoneSpecified: null, + // instantUtcDateTimeString: null, + // rangeDescriptor: null + // ], + // spatialBounding: null, + // internalParentIdentifier: null, + // errors: [ + // [nonsense:"horrible", // FIXME this is not actually desired + // source: "valid field" + // ] + // ] + // ] + def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "internalParentIdentifier", "errors" ] + indexedRecord.keySet().size() == expectedKeyset.size() + indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) + + indexedRecord.temporalBounding == [ + beginDescriptor: ValidDescriptor.VALID, + beginPrecision: 
ChronoUnit.DAYS.toString(), + beginIndexable: true, + beginZoneSpecified: null, + beginUtcDateTimeString: "2000-02-01", + endDescriptor: null, + endPrecision: null, + endIndexable: null, + endZoneSpecified: null, + endUtcDateTimeString: null, + instantDescriptor: null, + instantPrecision: null, + instantIndexable: null, + instantZoneSpecified: null, + instantUtcDateTimeString: null, + rangeDescriptor: null + ] + + indexedRecord.errors.size() == 1 + indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired + source: "valid field" + ] } //////////////////////////////// From 0e61e83dd84eedd7506bdff00573809385680878 Mon Sep 17 00:00:00 2001 From: Zeb Date: Mon, 1 Jun 2020 10:03:03 -0600 Subject: [PATCH 13/29] Changed method signatures to index while reporting un-indexable fields and removing them. --- .../elastic/common/ElasticsearchConfig.java | 6 +- .../onestop/indexer/util/IndexingInput.java | 32 +++- .../onestop/indexer/util/IndexingUtils.java | 6 +- .../indexer/util/TransformationUtils.java | 117 ++++++-------- .../util/TransformationUtilsSpec.groovy | 148 ++++-------------- 5 files changed, 112 insertions(+), 197 deletions(-) diff --git a/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java b/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java index 2910d2502..85d69488a 100644 --- a/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java +++ b/elastic-common/src/main/java/org/cedar/onestop/elastic/common/ElasticsearchConfig.java @@ -119,12 +119,12 @@ public String jsonMapping(String alias) { return this.jsonMappings.getOrDefault(alias, null); } - public Map parsedMapping(String alias) { + public Map parsedMapping(String alias) { // retrieve JSON mapping for index alias return this.parsedMappings.getOrDefault(alias, Collections.emptyMap()); } - public Map indexedProperties(String alias) { + public Map indexedProperties(String alias) { var parsed = (Map) parsedMapping(alias); var mappings = (Map) parsed.getOrDefault("mappings", Collections.emptyMap()); return (Map) mappings.getOrDefault("properties", Collections.emptyMap()); @@ -168,4 +168,4 @@ public Boolean sitemapEnabled() { return SITEMAP_ENABLED; } -} \ No newline at end of file +} diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java index f5ba13220..a1f46e046 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java @@ -52,23 +52,43 @@ public String getTargetAnalysisAndErrorsIndex() { return esConfig.analysisAndErrorsAliasFromType(recordType.toString()); } - public Set getTargetSearchIndexFields() { + public static Map getUnmappedAnalysisAndErrorsIndexFields() { + // this method is just to prevent us from logging warnings about fields in the analysis schema that we know and choose not to map + Map knownUnmappedTemporalFields = new HashMap(); + knownUnmappedTemporalFields.put("beginYear", new HashMap()); + knownUnmappedTemporalFields.put("beginDayOfYear", new HashMap()); + knownUnmappedTemporalFields.put("beginDayOfMonth", new HashMap()); + knownUnmappedTemporalFields.put("beginMonth", new HashMap()); + knownUnmappedTemporalFields.put("endYear", new HashMap()); + knownUnmappedTemporalFields.put("endDayOfYear", new HashMap()); + knownUnmappedTemporalFields.put("endDayOfMonth", new HashMap()); + 
knownUnmappedTemporalFields.put("endMonth", new HashMap()); + knownUnmappedTemporalFields.put("instantYear", new HashMap()); + knownUnmappedTemporalFields.put("instantDayOfYear", new HashMap()); + knownUnmappedTemporalFields.put("instantDayOfMonth", new HashMap()); + knownUnmappedTemporalFields.put("instantMonth", new HashMap()); + Map knownUnmappedFields = new HashMap(); + knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); + return knownUnmappedFields; + } + + public Map getTargetSearchIndexFields() { var searchAlias = esConfig.searchAliasFromType(recordType.toString()); if(searchAlias != null) { - return esConfig.indexedProperties(searchAlias).keySet(); + return esConfig.indexedProperties(searchAlias); } else { - return new HashSet<>(); + return new HashMap<>(); } } - public Set getTargetAnalysisAndErrorsIndexFields() { + public Map getTargetAnalysisAndErrorsIndexFields() { var aeAlias = esConfig.analysisAndErrorsAliasFromType(recordType.toString()); if(aeAlias != null) { - return esConfig.indexedProperties(aeAlias).keySet(); + return esConfig.indexedProperties(aeAlias); } else { - return new HashSet<>(); + return new HashMap<>(); } } diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java index 3be1903ed..e528071b8 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java @@ -78,9 +78,9 @@ public static DocWriteRequest buildAnalysisAndErrorWriteRequest(String indexN } else { var formattedRecord = new HashMap(); - log.info("build A&E write request "+input.getValue().value() +" and "+ input.getTargetAnalysisAndErrorsIndexFields()); - log.info("transforms to "+TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields())); - formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields())); + // log.info("build A&E write request "+input.getValue().value() +" and "+ input.getTargetAnalysisAndErrorsIndexFields()); + // log.info("transforms to "+TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields(), input.getUnmappedAnalysisAndErrorsIndexFields())); // TODO change this to pass the ES mapping in instead + formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields(), input.getUnmappedAnalysisAndErrorsIndexFields())); formattedRecord.put("stagedDate", input.getValue().timestamp()); return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord); } diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index 9cfb43edc..af7da4757 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -32,7 +32,7 @@ public class TransformationUtils { /////////////////////////////////////////////////////////////////////////////// // Indexing For Analysis & Errors // /////////////////////////////////////////////////////////////////////////////// - public static Map reformatMessageForAnalysisAndErrors(ParsedRecord 
record, Set targetFields) { + public static Map reformatMessageForAnalysisAndErrors(ParsedRecord record, Map targetFieldsMapping, Map knownUnmappedFields) { var analysis = record.getAnalysis(); var errors = record.getErrors(); @@ -46,34 +46,35 @@ public static Map reformatMessageForAnalysisAndErrors(ParsedReco analysisMap.put("errors", errorsList); // drop fields not present in target index - // TODO make recursive! - var result = new LinkedHashMap(targetFields.size()); - targetFields.forEach(f -> result.put(f, analysisMap.get(f))); - return result; - } - - public static Map unfilteredAEMessage(ParsedRecord record) { - var analysis = record.getAnalysis(); - var errors = record.getErrors(); - - var analysisMap = AvroUtils.avroToMap(analysis, true); - analysisMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); - var errorsList = errors.stream() - .map(e -> AvroUtils.avroToMap(e)) - .collect(Collectors.toList()); - - - var garbageError = new LinkedHashMap(); - garbageError.put("nonsense", "horrible"); - garbageError.put("source", "valid field" ); - errorsList.add(garbageError); - - analysisMap.put("errors", errorsList); - analysisMap.put("garbage", "nuke meeee"); // FIXME - return analysisMap; + var pruned = TransformationUtils.pruneKnownUnmappedFields(analysisMap, knownUnmappedFields); + var minus = TransformationUtils.identifyUnmappedFields(pruned, targetFieldsMapping); // TODO identify which it's going to + log.warn("The following fields were dropped when indexing to analysis and errors: " + minus); // TODO "add for record `id`" + return DataUtils.removeFromMap(pruned, minus); } + // public static Map unfilteredAEMessage(ParsedRecord record) { + // var analysis = record.getAnalysis(); + // var errors = record.getErrors(); + // + // var analysisMap = AvroUtils.avroToMap(analysis, true); + // analysisMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); + // var errorsList = errors.stream() + // .map(e -> AvroUtils.avroToMap(e)) + // .collect(Collectors.toList()); + // + // + // var garbageError = new LinkedHashMap(); + // garbageError.put("nonsense", "horrible"); + // garbageError.put("source", "valid field" ); + // errorsList.add(garbageError); + // + // + // analysisMap.put("errors", errorsList); + // analysisMap.put("garbage", "nuke meeee"); // FIXME + // return analysisMap; + // } + public static Map pruneKnownUnmappedFields(Map analysisMap, Map unmappedFields) { var result = new LinkedHashMap(); @@ -81,14 +82,13 @@ public static Map pruneKnownUnmappedFields(Map a if (!unmappedFields.containsKey(k)) { result.put(k, v); } else { - Map nestedProperties = (Map)((Map)unmappedFields.get(k)); // TODO almost identical to stuff to remove... but reversed... and no ".properties" layer... 
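
For orientation, pruneKnownUnmappedFields walks the analysis map against a skeleton of known-unmapped field names: a key that appears in the skeleton as a leaf is dropped, and nested maps recurse. A minimal, runnable sketch of that recursion, with toy field names and the list branch omitted (a sketch under those assumptions, not the project's code):

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class PruneSketch {
      // Drop any leaf listed in the skeleton; recurse where both data and skeleton nest.
      static Map<String, Object> prune(Map<String, Object> data, Map<String, Object> skeleton) {
        var result = new LinkedHashMap<String, Object>();
        data.forEach((k, v) -> {
          if (!skeleton.containsKey(k)) {
            result.put(k, v); // not a known-unmapped field: keep it
          } else if (v instanceof Map) {
            @SuppressWarnings("unchecked")
            var nested = prune((Map<String, Object>) v, (Map<String, Object>) skeleton.get(k));
            result.put(k, nested);
          } // a listed leaf (e.g. beginYear) is silently dropped
        });
        return result;
      }

      public static void main(String[] args) {
        var skeleton = Map.<String, Object>of("temporalBounding", Map.of("beginYear", Map.of()));
        var data = new LinkedHashMap<String, Object>();
        data.put("temporalBounding", Map.of("beginDate", "2000-02-01", "beginYear", 2000));
        data.put("spatialBounding", null);
        System.out.println(prune(data, skeleton));
        // -> {temporalBounding={beginDate=2000-02-01}, spatialBounding=null}
      }
    }

The skeleton reuses plain maps as a poor-man's set of paths, which is why every leaf built by getUnmappedAnalysisAndErrorsIndexFields above is an empty HashMap.
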
+ Map nestedProperties = (Map)((Map)unmappedFields.get(k)); if (v instanceof Map) { result.put(k, pruneKnownUnmappedFields((Map) v, nestedProperties)); } else if (v instanceof List) { var list = ((List) v).stream().map(item -> pruneKnownUnmappedFields((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) .collect(Collectors.toList()); - System.out.println("ZEB - list: "+list); result.put(k, list); } } @@ -96,67 +96,35 @@ public static Map pruneKnownUnmappedFields(Map a return result; } - public static Map identifyUnmappedFields(Map analysisMap, Map mapping) { + public static Map identifyUnmappedFields(Map analysisMap, Map mapping) { var result = new LinkedHashMap(); - // analysisMap.entrySet().stream().forEach(e -> { - // if( !mapping.containsKey(e.getKey())) { - // result.put(e.getKey(), e.getValue()); - // } else { - // if (e.getValue() instanceof Map){ - // System.out.println("ZEB: the value is a map!"); - // // System.out.println("mapping: "+mapping.get(e.getKey()).get("properties")); - // // System.out.println("--> "+identifyUnmappedFields((Map)e.getValue(), (Map)mapping.get(e.getKey()).get("properties"))); - // result.put(e.getKey(), identifyUnmappedFields((Map)e.getValue(), (Map)((Map)mapping.get(e.getKey())).get("properties"))); // TODO brute force assumes mapping is an object map string:object here too - // } else if(e.getValue() instanceof Collection){ - // // TODO!!!! - // // result.put(e.getKey(), ((Collection)e.getValue()).filter(item -> !identifyUnmappedFields((Map)item, (Map)((Map)mapping.get(e.getKey())).get("properties"))).isEmpty()); - // } - // } - // }); - // return result; - // + + if (mapping == null) { + return analysisMap; + } analysisMap.forEach((k, v) -> { if (!mapping.containsKey(k)) { result.put(k, v); } else { - Map nestedProperties = (Map)((Map)mapping.get(k)).get("properties"); // TODO assumes mapping is also a Map! - - // Map knownUnmapped = (Map)knownUnmappedFields.get(k); + Map nestedProperties = (Map)((Map)mapping.get(k)).get("properties"); // TODO assumes mapping is also a Map! 
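
The one structural difference from the prune step is the ".properties" unwrap the TODO above mentions: an Elasticsearch object mapping nests its children under a properties key at every level, while the documents themselves do not. A runnable sketch of that mapping-driven walk (toy mapping, list branch omitted; hypothetical names, not the project's code):

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class UnmappedFieldsSketch {
      // Collect document entries that have no counterpart in an ES-style mapping.
      @SuppressWarnings("unchecked")
      static Map<String, Object> unmapped(Map<String, Object> doc, Map<String, Object> mapping) {
        var extra = new LinkedHashMap<String, Object>();
        if (mapping == null) { return doc; } // nothing is mapped at this level: all of it is extra
        doc.forEach((k, v) -> {
          if (!mapping.containsKey(k)) {
            extra.put(k, v);
          } else if (v instanceof Map) {
            // object mappings keep their children one level down, under "properties"
            var childProps = (Map<String, Object>) ((Map<String, Object>) mapping.get(k)).get("properties");
            var nested = unmapped((Map<String, Object>) v, childProps);
            if (!nested.isEmpty()) { extra.put(k, nested); }
          }
        });
        return extra;
      }

      public static void main(String[] args) {
        var mapping = Map.<String, Object>of("temporalBounding",
            Map.of("properties", Map.of("beginDescriptor", Map.of("type", "keyword"))));
        var doc = new LinkedHashMap<String, Object>();
        doc.put("temporalBounding", Map.of("beginDescriptor", "VALID", "fakeField", 123));
        doc.put("garbage", "nuke meeee");
        System.out.println(unmapped(doc, mapping));
        // -> {temporalBounding={fakeField=123}, garbage=nuke meeee}
      }
    }
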
if (v instanceof Map) { result.put(k, identifyUnmappedFields((Map) v, nestedProperties)); } else if (v instanceof List) { - var list = ((List) v).stream().map(item -> identifyUnmappedFields((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) + var list = ((List) v).stream().filter(item -> item instanceof Map).map(item -> identifyUnmappedFields((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) .collect(Collectors.toList()); - System.out.println("ZEB - list: "+list); result.put(k, list); } } }); return result; } -/* - toRemove.forEach((k, v) -> { - var originalValue = mergedMap.get(k); - if (v instanceof Map && originalValue instanceof Map) { - mergedMap.put(k, removeFromMap((Map) originalValue, (Map) v)); - } - else if (v instanceof List && originalValue instanceof List) { - var mergedList = new HashSet<>((List) originalValue); - mergedList.removeAll((List) v); - mergedMap.put(k, mergedList); - } - else if ((v == null && originalValue == null) || v.equals(originalValue)) { - mergedMap.remove(k); - } - }); -*/ /////////////////////////////////////////////////////////////////////////////// // Indexing For Search // /////////////////////////////////////////////////////////////////////////////// - public static Map reformatMessageForSearch(ParsedRecord record, Set targetFields) { + public static Map reformatMessageForSearch(ParsedRecord record, Map targetFieldsMapping) { var discovery = record.getDiscovery(); var analysis = record.getAnalysis(); var discoveryMap = AvroUtils.avroToMap(discovery, true); @@ -174,9 +142,16 @@ public static Map reformatMessageForSearch(ParsedRecord record, discoveryMap.put("checksums", prepareChecksums(record)); // drop fields not present in target index - var result = new LinkedHashMap(targetFields.size()); - targetFields.forEach(f -> result.put(f, discoveryMap.get(f))); - return result; + // // FIXME + // var result = new LinkedHashMap(targetFieldsMapping.size()); + // // targetFields.forEach(f -> result.put(f, discoveryMap.get(f))); + // return result; + + // var pruned = TransformationUtils.pruneKnownUnmappedFields(discoveryMap, knownUnmappedFields); + var pruned = discoveryMap; + var minus = TransformationUtils.identifyUnmappedFields(pruned, targetFieldsMapping); + log.warn("The following fields were dropped when indexing to search: " + minus); // TODO "add for record `id`" + return DataUtils.removeFromMap(pruned, minus); } //////////////////////////////// diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index e7f29ddf7..984af8e2c 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -25,9 +25,9 @@ import org.cedar.onestop.kafka.common.util.DataUtils; @Unroll class TransformationUtilsSpec extends Specification { - static collectionFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet() - static granuleFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet() - static granuleAnalysisErrorFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS).keySet() + static Map collectionFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS) + static Map granuleFields = 
TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS) + static Map granuleAnalysisErrorFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS) static expectedKeywords = [ "SIO > Super Important Organization", @@ -80,35 +80,20 @@ class TransformationUtilsSpec extends Specification { /////////////////////////////// // Generic Indexed Fields // /////////////////////////////// - def "only mapped #type fields are indexed"() { - when: - def result = TransformationUtils.reformatMessageForSearch(record, fields) - - then: - result.keySet().each({ assert fields.contains(it) }) - - where: - type | fields | record - 'collection' | collectionFields | TestUtils.inputCollectionRecord - 'granule' | granuleFields | TestUtils.inputGranuleRecord - } - - def "only mapped nested fields are indexed"() { - when: - def result = TransformationUtils.reformatMessageForAnalysisAndErrors(TestUtils.inputGranuleRecord, granuleAnalysisErrorFields) - - - def asdf = TransformationUtils.identifyUnmappedFields(TransformationUtils.unfilteredAEMessage(TestUtils.inputGranuleRecord), TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - - println("ZEB") - println(result) - println(JsonOutput.toJson(asdf)) - - then: - result.keySet().each({ assert granuleAnalysisErrorFields.contains(it) }) - } - - def "clean up nested map before indexing strictly mapped fields"() { + // def "only mapped #type fields are indexed"() { + // when: + // def result = TransformationUtils.reformatMessageForSearch(record, fields) + // + // then: + // result.keySet().each({ assert fields.keySet().contains(it) }) // TODO this is a shallow only check! + // + // where: + // type | fields | record + // 'collection' | collectionFields | TestUtils.inputCollectionRecord + // 'granule' | granuleFields | TestUtils.inputGranuleRecord + // } + + def "clean up nested map before indexing strictly mapped fields"() { // TODO change to use reformatMessageFor method when: def parsed = [ identification: null, @@ -160,21 +145,21 @@ class TransformationUtilsSpec extends Specification { - def knownUnmappedTemporalFields = new HashMap(); - knownUnmappedTemporalFields.put("beginYear", new HashMap()); - knownUnmappedTemporalFields.put("beginDayOfYear", new HashMap()); - knownUnmappedTemporalFields.put("beginDayOfMonth", new HashMap()); - knownUnmappedTemporalFields.put("beginMonth", new HashMap()); - knownUnmappedTemporalFields.put("endYear", new HashMap()); - knownUnmappedTemporalFields.put("endDayOfYear", new HashMap()); - knownUnmappedTemporalFields.put("endDayOfMonth", new HashMap()); - knownUnmappedTemporalFields.put("endMonth", new HashMap()); - knownUnmappedTemporalFields.put("instantYear", new HashMap()); - knownUnmappedTemporalFields.put("instantDayOfYear", new HashMap()); - knownUnmappedTemporalFields.put("instantDayOfMonth", new HashMap()); - knownUnmappedTemporalFields.put("instantMonth", new HashMap()); - def knownUnmappedFields = new HashMap(); - knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); + // def knownUnmappedTemporalFields = new HashMap(); + // knownUnmappedTemporalFields.put("beginYear", new HashMap()); + // knownUnmappedTemporalFields.put("beginDayOfYear", new HashMap()); + // knownUnmappedTemporalFields.put("beginDayOfMonth", new HashMap()); + // knownUnmappedTemporalFields.put("beginMonth", new HashMap()); + // knownUnmappedTemporalFields.put("endYear", new HashMap()); + // 
knownUnmappedTemporalFields.put("endDayOfYear", new HashMap()); + // knownUnmappedTemporalFields.put("endDayOfMonth", new HashMap()); + // knownUnmappedTemporalFields.put("endMonth", new HashMap()); + // knownUnmappedTemporalFields.put("instantYear", new HashMap()); + // knownUnmappedTemporalFields.put("instantDayOfYear", new HashMap()); + // knownUnmappedTemporalFields.put("instantDayOfMonth", new HashMap()); + // knownUnmappedTemporalFields.put("instantMonth", new HashMap()); + // def knownUnmappedFields = new HashMap(); + // knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) // .setAnalysis( @@ -195,7 +180,7 @@ class TransformationUtilsSpec extends Specification { // def parsed = TransformationUtils.unfilteredAEMessage(record) println("parsed "+JsonOutput.toJson(parsed)) - def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, knownUnmappedFields) + def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) println("pruned unampped? "+JsonOutput.toJson(pruned)) def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) println("creates minus: "+JsonOutput.toJson(minus)) @@ -215,71 +200,6 @@ class TransformationUtilsSpec extends Specification { garbage:"nuke meeee" ] - // println("wtf"+JsonOutput.toJson(indexedRecord)) - // println("wtf"+JsonOutput.toJson([ - // identification: null, - // titles: null, - // description: null, - // dataAccess: null, - // thumbnail: null, - // temporalBounding: [ - // beginDescriptor: ValidDescriptor.VALID, - // beginPrecision: ChronoUnit.DAYS.toString(), - // beginIndexable: true, - // beginZoneSpecified: null, - // beginUtcDateTimeString: "2000-02-01", - // endDescriptor: null, - // endPrecision: null, - // endIndexable: null, - // endZoneSpecified: null, - // endUtcDateTimeString: null, - // instantDescriptor: null, - // instantPrecision: null, - // instantIndexable: null, - // instantZoneSpecified: null, - // instantUtcDateTimeString: null, - // rangeDescriptor: null - // ], - // spatialBounding: null, - // internalParentIdentifier: null, - // errors: [ - // [nonsense:"horrible", - // source: "valid field" - // ] - // ] - // ])) - // assert indexedRecord == [ - // identification: null, - // titles: null, - // description: null, - // dataAccess: null, - // thumbnail: null, - // temporalBounding: [ - // beginDescriptor: ValidDescriptor.VALID, - // beginPrecision: ChronoUnit.DAYS.toString(), - // beginIndexable: true, - // beginZoneSpecified: null, - // beginUtcDateTimeString: "2000-02-01", - // endDescriptor: null, - // endPrecision: null, - // endIndexable: null, - // endZoneSpecified: null, - // endUtcDateTimeString: null, - // instantDescriptor: null, - // instantPrecision: null, - // instantIndexable: null, - // instantZoneSpecified: null, - // instantUtcDateTimeString: null, - // rangeDescriptor: null - // ], - // spatialBounding: null, - // internalParentIdentifier: null, - // errors: [ - // [nonsense:"horrible", // FIXME this is not actually desired - // source: "valid field" - // ] - // ] - // ] def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "internalParentIdentifier", "errors" ] indexedRecord.keySet().size() == expectedKeyset.size() indexedRecord.keySet().each({ assert 
expectedKeyset.contains(it) }) @@ -577,7 +497,7 @@ class TransformationUtilsSpec extends Specification { def "accession values are not included"() { when: - def result = TransformationUtils.reformatMessageForSearch(TestUtils.inputAvroRecord, TestUtils.esConfig.parsedMapping(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet()) + def result = TransformationUtils.reformatMessageForSearch(TestUtils.inputAvroRecord, collectionFields) then: result.accessionValues == null From a82f22459d335182f84486a309d7d5556f8ad637 Mon Sep 17 00:00:00 2001 From: Zeb Date: Mon, 1 Jun 2020 11:38:40 -0600 Subject: [PATCH 14/29] more unit tests --- .../indexer/util/TransformationUtils.java | 33 ++- .../util/TransformationUtilsSpec.groovy | 272 ++++++++++++++++-- 2 files changed, 266 insertions(+), 39 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index af7da4757..f4d586b11 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -396,23 +396,26 @@ private static Map prepareDates(TemporalBounding bounding, Tempo private static HashMap parseAdditionalTimeFields(String prefix, String time){ var result = new HashMap(); - Integer dayOfYear, dayOfMonth, month; - if (time != null) { - ZonedDateTime dateTime = ZonedDateTime.parse(time); + try { - dayOfYear = dateTime.getDayOfYear(); - dayOfMonth = dateTime.getDayOfMonth(); - month = dateTime.getMonthValue(); - } - else { - dayOfYear = null; - dayOfMonth = null; - month = null; - } + Integer dayOfYear, dayOfMonth, month; + if (time != null) { + ZonedDateTime dateTime = ZonedDateTime.parse(time); + + dayOfYear = dateTime.getDayOfYear(); + dayOfMonth = dateTime.getDayOfMonth(); + month = dateTime.getMonthValue(); + } + else { + dayOfYear = null; + dayOfMonth = null; + month = null; + } - result.put(prefix + "DayOfYear", dayOfYear); - result.put(prefix + "DayOfMonth", dayOfMonth); - result.put(prefix + "Month", month); + result.put(prefix + "DayOfYear", dayOfYear); + result.put(prefix + "DayOfMonth", dayOfMonth); + result.put(prefix + "Month", month); + } catch (Exception e) {} // TODO temporary return result; } diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index 984af8e2c..a1b95f592 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -93,10 +93,140 @@ class TransformationUtilsSpec extends Specification { // 'granule' | granuleFields | TestUtils.inputGranuleRecord // } - def "clean up nested map before indexing strictly mapped fields"() { // TODO change to use reformatMessageFor method + + def "clean up nested map before indexing strictly mapped fields for search (granule)"() { + when: + // def parsed = [ + // identification: null, + // titles: null, + // description: null, + // dataAccess: null, + // thumbnail: null, + // temporalBounding: [ + // beginDescriptor: ValidDescriptor.VALID, + // beginPrecision: ChronoUnit.DAYS.toString(), + // beginIndexable: true, + // beginZoneSpecified: null, + // beginUtcDateTimeString: "2000-02-01", + // beginYear: 2000, + // beginDayOfYear: 32, + // beginDayOfMonth: 1, + // beginMonth: 2, + // 
endDescriptor: null, + // endPrecision: null, + // endIndexable: null, + // endZoneSpecified: null, + // endUtcDateTimeString: null, + // endYear: null, + // endDayOfYear: null, + // endDayOfMonth: null, + // endMonth: null, + // instantDescriptor: null, + // instantPrecision: null, + // instantIndexable: null, + // instantZoneSpecified: null, + // instantUtcDateTimeString: null, + // instantYear: null, + // instantDayOfYear: null, + // instantDayOfMonth: null, + // instantMonth: null, + // rangeDescriptor: null, + // fakeField: 123 + // ], + // spatialBounding: null, + // internalParentIdentifier: null, + // errors: [ + // [ + // nonsense: "horrible", + // source: "valid field" + // ] + // ], + // garbage:"nuke meeee" + // ] + ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) + .setAnalysis( + Analysis.newBuilder().setTemporalBounding( + TemporalBoundingAnalysis.newBuilder() + .setBeginDescriptor(ValidDescriptor.VALID) + .setBeginIndexable(true) + .setBeginPrecision(ChronoUnit.DAYS.toString()) + .setBeginZoneSpecified(null) + .setBeginUtcDateTimeString("2000-02-01") + .setBeginYear(2000) + .setBeginMonth(2) + .setBeginDayOfYear(32) + .setBeginDayOfMonth(1) + .build() + ).build()).build() + + + // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) + def indexedRecord = TransformationUtils.reformatMessageForSearch(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS)) + // def indexedRecord = DataUtils.removeFromMap(pruned, minus) + + then: + // minus == [ + // temporalBounding: [ + // fakeField: 123 + // ], + // errors: [ + // [ + // nonsense: "horrible", + // ] + // ], + // garbage:"nuke meeee" + // ] + + def expectedKeyset = ["fileIdentifier", "parentIdentifier", "doi", "title", "description", "keywords", "topicCategories", "temporalBounding", "spatialBounding", "isGlobal", "acquisitionInstruments", "acquisitionOperations", "acquisitionPlatforms", "dataFormats", "links", "responsibleParties", "thumbnail", "citeAsStatements", "crossReferences", "largerWorks", "legalConstraints", "services", "gcmdVerticalResolution", "gcmdDataCenters", "gcmdTemporalResolution", "gcmdLocations", "gcmdScience", "beginDate", "endDate", "endDayOfYear", "beginYear", "endMonth", "endYear", "endDayOfMonth", "dataFormat", "linkProtocol", "serviceLinks", "serviceLinkProtocol", "internalParentIdentifier", "filename", "checksums"] + + + indexedRecord.keySet().size() == expectedKeyset.size() + indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) + + } + + + def "clean up nested map before indexing strictly mapped fields for search (collection)"() { + when: + + ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) + .setAnalysis( + Analysis.newBuilder().setTemporalBounding( + TemporalBoundingAnalysis.newBuilder() + .setBeginDescriptor(ValidDescriptor.VALID) + .setBeginIndexable(true) + .setBeginPrecision(ChronoUnit.DAYS.toString()) + .setBeginZoneSpecified(null) + .setBeginUtcDateTimeString("2000-02-01") + .setBeginYear(2000) + .setBeginMonth(2) + .setBeginDayOfYear(32) + .setBeginDayOfMonth(1) + .build() + ).build()).build() + + + // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) + def indexedRecord = TransformationUtils.reformatMessageForSearch(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS)) + // def indexedRecord = 
DataUtils.removeFromMap(pruned, minus) + + then: + + def expectedKeyset = ["fileIdentifier", "parentIdentifier", "doi", "title", "description", "keywords", "topicCategories", "temporalBounding", "spatialBounding", "isGlobal", "acquisitionInstruments", "acquisitionOperations", "acquisitionPlatforms", "dataFormats", "links", "responsibleParties", "thumbnail", "citeAsStatements", "crossReferences", "largerWorks", "useLimitation", "legalConstraints", "accessFeeStatement", "orderingInstructions", "edition", "dsmmAverage", "services", "gcmdVerticalResolution", "gcmdDataCenters", "gcmdTemporalResolution", "gcmdLocations", "gcmdScience", "beginDate", "endDate", "endDayOfYear", "beginYear", "endMonth", "endYear", "endDayOfMonth", "dataFormat", "linkProtocol", "serviceLinks", "serviceLinkProtocol", "organizationNames", + "individualNames", "checksums"] + + + indexedRecord.keySet().size() == expectedKeyset.size() + expectedKeyset.each({ assert indexedRecord.keySet().contains(it) }) + indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) + + } + + def "clean up nested map before indexing strictly mapped fields for analysis and errors (granule)"() { // TODO change to use reformatMessageFor method when: def parsed = [ identification: null, + internalParentIdentifier: null, titles: null, description: null, dataAccess: null, @@ -133,7 +263,6 @@ class TransformationUtilsSpec extends Specification { fakeField: 123 ], spatialBounding: null, - internalParentIdentifier: null, errors: [ [ nonsense: "horrible", @@ -143,24 +272,6 @@ class TransformationUtilsSpec extends Specification { garbage:"nuke meeee" ] - - - // def knownUnmappedTemporalFields = new HashMap(); - // knownUnmappedTemporalFields.put("beginYear", new HashMap()); - // knownUnmappedTemporalFields.put("beginDayOfYear", new HashMap()); - // knownUnmappedTemporalFields.put("beginDayOfMonth", new HashMap()); - // knownUnmappedTemporalFields.put("beginMonth", new HashMap()); - // knownUnmappedTemporalFields.put("endYear", new HashMap()); - // knownUnmappedTemporalFields.put("endDayOfYear", new HashMap()); - // knownUnmappedTemporalFields.put("endDayOfMonth", new HashMap()); - // knownUnmappedTemporalFields.put("endMonth", new HashMap()); - // knownUnmappedTemporalFields.put("instantYear", new HashMap()); - // knownUnmappedTemporalFields.put("instantDayOfYear", new HashMap()); - // knownUnmappedTemporalFields.put("instantDayOfMonth", new HashMap()); - // knownUnmappedTemporalFields.put("instantMonth", new HashMap()); - // def knownUnmappedFields = new HashMap(); - // knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); - // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) // .setAnalysis( // Analysis.newBuilder().setTemporalBounding( @@ -179,13 +290,9 @@ class TransformationUtilsSpec extends Specification { // def parsed = TransformationUtils.unfilteredAEMessage(record) - println("parsed "+JsonOutput.toJson(parsed)) def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - println("pruned unampped? 
"+JsonOutput.toJson(pruned)) def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - println("creates minus: "+JsonOutput.toJson(minus)) def indexedRecord = DataUtils.removeFromMap(pruned, minus) - println("which results in indexing: "+ JsonOutput.toJson(indexedRecord)) then: minus == [ @@ -227,8 +334,125 @@ class TransformationUtilsSpec extends Specification { indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired source: "valid field" ] + } + def "clean up nested map before indexing strictly mapped fields for analysis and errors (collection)"() { // TODO change to use reformatMessageFor method + when: + def parsed = [ + identification: null, + internalParentIdentifier: null, + titles: null, + description: null, + dataAccess: null, + thumbnail: null, + temporalBounding: [ + beginDescriptor: ValidDescriptor.VALID, + beginPrecision: ChronoUnit.DAYS.toString(), + beginIndexable: true, + beginZoneSpecified: null, + beginUtcDateTimeString: "2000-02-01", + beginYear: 2000, + beginDayOfYear: 32, + beginDayOfMonth: 1, + beginMonth: 2, + endDescriptor: null, + endPrecision: null, + endIndexable: null, + endZoneSpecified: null, + endUtcDateTimeString: null, + endYear: null, + endDayOfYear: null, + endDayOfMonth: null, + endMonth: null, + instantDescriptor: null, + instantPrecision: null, + instantIndexable: null, + instantZoneSpecified: null, + instantUtcDateTimeString: null, + instantYear: null, + instantDayOfYear: null, + instantDayOfMonth: null, + instantMonth: null, + rangeDescriptor: null, + fakeField: 123 + ], + spatialBounding: null, + errors: [ + [ + nonsense: "horrible", + source: "valid field" + ] + ], + garbage:"nuke meeee" + ] + + // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) + // .setAnalysis( + // Analysis.newBuilder().setTemporalBounding( + // TemporalBoundingAnalysis.newBuilder() + // .setBeginDescriptor(ValidDescriptor.VALID) + // .setBeginIndexable(true) + // .setBeginPrecision(ChronoUnit.DAYS.toString()) + // .setBeginZoneSpecified(null) + // .setBeginUtcDateTimeString("2000-02-01") + // .setBeginYear(2000) + // .setBeginMonth(2) + // .setBeginDayOfYear(32) + // .setBeginDayOfMonth(1) + // .build() + // ).build()).build() + + // def parsed = TransformationUtils.unfilteredAEMessage(record) + + def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) + def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + def indexedRecord = DataUtils.removeFromMap(pruned, minus) + + then: + minus == [ + internalParentIdentifier: null, // ok for granule, not collection + temporalBounding: [ + fakeField: 123 + ], + errors: [ + [ + nonsense: "horrible", + ] + ], + garbage:"nuke meeee" + ] + + def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "errors" ] + indexedRecord.keySet().size() == expectedKeyset.size() + indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) + + indexedRecord.temporalBounding == [ + beginDescriptor: ValidDescriptor.VALID, + beginPrecision: ChronoUnit.DAYS.toString(), + beginIndexable: true, + beginZoneSpecified: null, + beginUtcDateTimeString: "2000-02-01", + endDescriptor: null, + endPrecision: null, + endIndexable: null, + endZoneSpecified: 
null, + endUtcDateTimeString: null, + instantDescriptor: null, + instantPrecision: null, + instantIndexable: null, + instantZoneSpecified: null, + instantUtcDateTimeString: null, + rangeDescriptor: null + ] + + indexedRecord.errors.size() == 1 + indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired + source: "valid field" + ] + + } + //////////////////////////////// // Identifiers, "Names" // //////////////////////////////// From 3a8fcf0be01cf75e1dc11151928759e577ebe721 Mon Sep 17 00:00:00 2001 From: Zeb Date: Fri, 5 Jun 2020 10:17:43 -0600 Subject: [PATCH 15/29] only add fields that should be there, transformed if needed --- .../onestop/indexer/util/IndexingInput.java | 50 +- .../onestop/indexer/util/IndexingUtils.java | 6 +- .../indexer/util/TransformationUtils.java | 152 +-- .../util/TransformationUtilsSpec.groovy | 925 +++++++++++++----- 4 files changed, 740 insertions(+), 393 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java index a1f46e046..fcfe59659 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java @@ -52,64 +52,26 @@ public String getTargetAnalysisAndErrorsIndex() { return esConfig.analysisAndErrorsAliasFromType(recordType.toString()); } - public static Map getUnmappedAnalysisAndErrorsIndexFields() { - // this method is just to prevent us from logging warnings about fields in the analysis schema that we know and choose not to map - Map knownUnmappedTemporalFields = new HashMap(); - knownUnmappedTemporalFields.put("beginYear", new HashMap()); - knownUnmappedTemporalFields.put("beginDayOfYear", new HashMap()); - knownUnmappedTemporalFields.put("beginDayOfMonth", new HashMap()); - knownUnmappedTemporalFields.put("beginMonth", new HashMap()); - knownUnmappedTemporalFields.put("endYear", new HashMap()); - knownUnmappedTemporalFields.put("endDayOfYear", new HashMap()); - knownUnmappedTemporalFields.put("endDayOfMonth", new HashMap()); - knownUnmappedTemporalFields.put("endMonth", new HashMap()); - knownUnmappedTemporalFields.put("instantYear", new HashMap()); - knownUnmappedTemporalFields.put("instantDayOfYear", new HashMap()); - knownUnmappedTemporalFields.put("instantDayOfMonth", new HashMap()); - knownUnmappedTemporalFields.put("instantMonth", new HashMap()); - Map knownUnmappedFields = new HashMap(); - knownUnmappedFields.put("temporalBounding", knownUnmappedTemporalFields); - return knownUnmappedFields; - } - - public Map getTargetSearchIndexFields() { + public Set getTargetSearchIndexFields() { var searchAlias = esConfig.searchAliasFromType(recordType.toString()); if(searchAlias != null) { - return esConfig.indexedProperties(searchAlias); + return esConfig.indexedProperties(searchAlias).keySet(); } else { - return new HashMap<>(); + return new HashSet<>(); } } - public Map getTargetAnalysisAndErrorsIndexFields() { + public Set getTargetAnalysisAndErrorsIndexFields() { var aeAlias = esConfig.analysisAndErrorsAliasFromType(recordType.toString()); if(aeAlias != null) { - return esConfig.indexedProperties(aeAlias); + return esConfig.indexedProperties(aeAlias).keySet(); } else { - return new HashMap<>(); + return new HashSet<>(); } } - // public Map getTargetAnalysisAndErrorsIndexMapping() { - // var aeAlias = esConfig.analysisAndErrorsAliasFromType(recordType.toString()); - // if(aeAlias != null) { - // return 
esConfig.indexedProperties(aeAlias); - // } - // else { - // return new HashMap<>(); - // } - // } - - // public static Map getNestedKeys(Map originalMap) { - // if (keysToKeep == null || keysToKeep.size() == 0) { - // return new HashMap<>(); - // } - // return originalMap.entrySet().stream() - // .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - // } - @Override public String toString() { return "IndexingInput {" + diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java index e528071b8..01a9a14e9 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java @@ -64,8 +64,6 @@ public static DocWriteRequest buildSearchWriteRequest(String indexName, DocWr } else { var formattedRecord = new HashMap(); - // log.info("build search write request "+input.getValue().value()+ " and "+input.getTargetSearchIndexFields()); - // log.info("transforms to "+TransformationUtils.reformatMessageForSearch(input.getValue().value(), input.getTargetSearchIndexFields())); formattedRecord.putAll(TransformationUtils.reformatMessageForSearch(input.getValue().value(), input.getTargetSearchIndexFields())); formattedRecord.put("stagedDate", input.getValue().timestamp()); return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord); @@ -78,9 +76,7 @@ public static DocWriteRequest buildAnalysisAndErrorWriteRequest(String indexN } else { var formattedRecord = new HashMap(); - // log.info("build A&E write request "+input.getValue().value() +" and "+ input.getTargetAnalysisAndErrorsIndexFields()); - // log.info("transforms to "+TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields(), input.getUnmappedAnalysisAndErrorsIndexFields())); // TODO change this to pass the ES mapping in instead - formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields(), input.getUnmappedAnalysisAndErrorsIndexFields())); + formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields())); formattedRecord.put("stagedDate", input.getValue().timestamp()); return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord); } diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index f4d586b11..bf0f17d47 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -32,126 +32,76 @@ public class TransformationUtils { /////////////////////////////////////////////////////////////////////////////// // Indexing For Analysis & Errors // /////////////////////////////////////////////////////////////////////////////// - public static Map reformatMessageForAnalysisAndErrors(ParsedRecord record, Map targetFieldsMapping, Map knownUnmappedFields) { + public static Map reformatMessageForAnalysisAndErrors(ParsedRecord record, Set fields) { var analysis = record.getAnalysis(); var errors = record.getErrors(); var analysisMap = AvroUtils.avroToMap(analysis, true); - analysisMap.put("internalParentIdentifier", 
prepareInternalParentIdentifier(record)); + var message = new HashMap(); + + fields.forEach(field -> { + message.put(field, analysisMap.get(field)); + }); + if (fields.contains("internalParentIdentifier")) { + analysisMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); + } var errorsList = errors.stream() .map(e -> AvroUtils.avroToMap(e)) .collect(Collectors.toList()); + message.put("errors", errorsList); - analysisMap.put("errors", errorsList); - - // drop fields not present in target index - - var pruned = TransformationUtils.pruneKnownUnmappedFields(analysisMap, knownUnmappedFields); - var minus = TransformationUtils.identifyUnmappedFields(pruned, targetFieldsMapping); // TODO identify which it's going to - log.warn("The following fields were dropped when indexing to analysis and errors: " + minus); // TODO "add for record `id`" - return DataUtils.removeFromMap(pruned, minus); - } - - // public static Map unfilteredAEMessage(ParsedRecord record) { - // var analysis = record.getAnalysis(); - // var errors = record.getErrors(); - // - // var analysisMap = AvroUtils.avroToMap(analysis, true); - // analysisMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); - // var errorsList = errors.stream() - // .map(e -> AvroUtils.avroToMap(e)) - // .collect(Collectors.toList()); - // - // - // var garbageError = new LinkedHashMap(); - // garbageError.put("nonsense", "horrible"); - // garbageError.put("source", "valid field" ); - // errorsList.add(garbageError); - // - // - // analysisMap.put("errors", errorsList); - // analysisMap.put("garbage", "nuke meeee"); // FIXME - // return analysisMap; - // } - - public static Map pruneKnownUnmappedFields(Map analysisMap, Map unmappedFields) { - - var result = new LinkedHashMap(); - analysisMap.forEach((k, v) -> { - if (!unmappedFields.containsKey(k)) { - result.put(k, v); - } else { - Map nestedProperties = (Map)((Map)unmappedFields.get(k)); - - if (v instanceof Map) { - result.put(k, pruneKnownUnmappedFields((Map) v, nestedProperties)); - } else if (v instanceof List) { - var list = ((List) v).stream().map(item -> pruneKnownUnmappedFields((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) - .collect(Collectors.toList()); - result.put(k, list); - } - } - }); - return result; - } - - public static Map identifyUnmappedFields(Map analysisMap, Map mapping) { - var result = new LinkedHashMap(); - - if (mapping == null) { - return analysisMap; - } - - analysisMap.forEach((k, v) -> { - if (!mapping.containsKey(k)) { - result.put(k, v); - } else { - Map nestedProperties = (Map)((Map)mapping.get(k)).get("properties"); // TODO assumes mapping is also a Map! 
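
The rewrite above inverts the old prune-and-diff approach: rather than computing which fields to remove, it copies only the fields the target mapping names. A minimal sketch of that allow-list copy (hypothetical names), with the caveat the test TODOs also note, that the check is shallow, so unmapped nested fields ride along:

    import java.util.HashMap;
    import java.util.Map;
    import java.util.Set;

    public class AllowListSketch {
      // Copy only the top-level fields present in the target index mapping.
      static Map<String, Object> restrictToFields(Map<String, Object> source, Set<String> fields) {
        var message = new HashMap<String, Object>();
        fields.forEach(f -> message.put(f, source.get(f))); // absent fields land as null entries
        return message;
      }

      public static void main(String[] args) {
        var source = Map.<String, Object>of(
            "temporalBounding", Map.of("fakeField", 123),
            "garbage", "nuke meeee");
        System.out.println(restrictToFields(source, Set.of("temporalBounding", "spatialBounding")));
        // "garbage" is dropped, "spatialBounding" shows up as null, and
        // temporalBounding.fakeField survives because only top-level keys are checked
      }
    }
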
- - if (v instanceof Map) { - result.put(k, identifyUnmappedFields((Map) v, nestedProperties)); - } else if (v instanceof List) { - var list = ((List) v).stream().filter(item -> item instanceof Map).map(item -> identifyUnmappedFields((Map) item, nestedProperties)).filter(item -> !((Map)item).isEmpty()) - .collect(Collectors.toList()); - result.put(k, list); - } - } - }); - return result; - } /////////////////////////////////////////////////////////////////////////////// // Indexing For Search // /////////////////////////////////////////////////////////////////////////////// - public static Map reformatMessageForSearch(ParsedRecord record, Map targetFieldsMapping) { + public static Map reformatMessageForSearch(ParsedRecord record, Set fields) { var discovery = record.getDiscovery(); var analysis = record.getAnalysis(); var discoveryMap = AvroUtils.avroToMap(discovery, true); + var message = new HashMap(); + fields.forEach(field -> { + message.put(field, discoveryMap.get(field)); + }); // prepare and apply fields that need to be reformatted for search - discoveryMap.putAll(prepareGcmdKeyword(discovery)); - discoveryMap.putAll(prepareDates(discovery.getTemporalBounding(), analysis.getTemporalBounding())); - discoveryMap.put("dataFormat", prepareDataFormats(discovery)); - discoveryMap.put("linkProtocol", prepareLinkProtocols(discovery)); - discoveryMap.put("serviceLinks", prepareServiceLinks(discovery)); - discoveryMap.put("serviceLinkProtocol", prepareServiceLinkProtocols(discovery)); - discoveryMap.putAll(prepareResponsibleParties(record)); - discoveryMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); - discoveryMap.put("filename", prepareFilename(record)); - discoveryMap.put("checksums", prepareChecksums(record)); - - // drop fields not present in target index - // // FIXME - // var result = new LinkedHashMap(targetFieldsMapping.size()); - // // targetFields.forEach(f -> result.put(f, discoveryMap.get(f))); - // return result; - - // var pruned = TransformationUtils.pruneKnownUnmappedFields(discoveryMap, knownUnmappedFields); - var pruned = discoveryMap; - var minus = TransformationUtils.identifyUnmappedFields(pruned, targetFieldsMapping); - log.warn("The following fields were dropped when indexing to search: " + minus); // TODO "add for record `id`" - return DataUtils.removeFromMap(pruned, minus); + message.putAll(prepareGcmdKeyword(discovery));// TODO does this need an if? + message.putAll(prepareDates(discovery.getTemporalBounding(), analysis.getTemporalBounding())); // TODO does this need an if?
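
On the two TODOs just above: prepareGcmdKeyword and prepareDates each return a map with several keys, so an unconditional putAll can reintroduce fields the target index does not map. The per-key guard already used for responsibleParties a few lines below is the pattern such a check would follow; as a helper it might look like this (hypothetical name, a sketch only):

    import java.util.HashMap;
    import java.util.Map;
    import java.util.Set;

    public class GuardedPutAllSketch {
      // Merge only the prepared entries whose keys the target index actually maps.
      static void putAllIfMapped(Map<String, Object> message, Map<String, ?> prepared, Set<String> fields) {
        prepared.forEach((key, value) -> {
          if (fields.contains(key)) {
            message.put(key, value);
          }
        });
      }

      public static void main(String[] args) {
        var message = new HashMap<String, Object>();
        putAllIfMapped(message,
            Map.of("gcmdScience", Set.of("EARTH SCIENCE"), "gcmdFake", Set.of("x")),
            Set.of("gcmdScience"));
        System.out.println(message); // -> {gcmdScience=[EARTH SCIENCE]}
      }
    }
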
+ if (fields.contains("dataFormat")) { + message.put("dataFormat", prepareDataFormats(discovery)); + } + if (fields.contains("linkProtocol")) { + message.put("linkProtocol", prepareLinkProtocols(discovery)); + } + if (fields.contains("serviceLinks")) { + message.put("serviceLinks", prepareServiceLinks(discovery)); + } + if (fields.contains("serviceLinkProtocol")) { + message.put("serviceLinkProtocol", prepareServiceLinkProtocols(discovery)); + } + Map> responsibleParties = prepareResponsibleParties(record); + responsibleParties.forEach((key, value) -> { + if (fields.contains(key)) { + message.put(key, value); + } + }); + + if (fields.contains("internalParentIdentifier")) { + message.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); + } + if (fields.contains("filename")) { + message.put("filename", prepareFilename(record)); + } + if (fields.contains("checksums")) { + message.put("checksums", prepareChecksums(record)); + log.debug("including checksums (granule target)"); + } else { + log.debug("excluding checksums (collection target)"); + } + + return message; } //////////////////////////////// diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index a1b95f592..83ad4575b 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -5,6 +5,8 @@ import org.cedar.schemas.analyze.Temporal import org.cedar.schemas.avro.psi.Analysis import org.cedar.schemas.avro.psi.TemporalBoundingAnalysis import org.cedar.schemas.avro.psi.ValidDescriptor +import org.cedar.schemas.avro.psi.Checksum +import org.cedar.schemas.avro.psi.ChecksumAlgorithm import org.cedar.schemas.avro.psi.Discovery import org.cedar.schemas.avro.psi.FileInformation import org.cedar.schemas.avro.psi.ParsedRecord @@ -17,6 +19,8 @@ import spock.lang.Specification import spock.lang.Unroll import groovy.json.JsonOutput +import groovy.json.JsonSlurper +import org.cedar.schemas.avro.util.AvroUtils import static org.cedar.schemas.avro.util.TemporalTestData.getSituations @@ -25,9 +29,9 @@ import org.cedar.onestop.kafka.common.util.DataUtils; @Unroll class TransformationUtilsSpec extends Specification { - static Map collectionFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS) - static Map granuleFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS) - static Map granuleAnalysisErrorFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS) + static Set collectionFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet() + static Set granuleFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet() + static Set granuleAnalysisErrorFields = TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS).keySet() @@ -93,10 +97,24 @@ class TransformationUtilsSpec extends Specification { // 'granule' | granuleFields | TestUtils.inputGranuleRecord // } + def "checksums are only indexed for #label when the mapping includes them"() { + when: + + ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) +
.setFileInformation(FileInformation.newBuilder().setChecksums([Checksum.newBuilder().setAlgorithm(ChecksumAlgorithm.MD5).setValue('abc').build()]).build()).build() + + def indexedRecord = TransformationUtils.reformatMessageForSearch(record, fields) + + then: + + indexedRecord.keySet().contains("checksums") == shouldIncludeChecksums + + where: + label | shouldIncludeChecksums | fields + 'collections' | false | TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet() + 'granules' | true | TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet() + + } def "clean up nested map before indexing strictly mapped fields for search (granule)"() { when: @@ -161,7 +183,7 @@ class TransformationUtilsSpec extends Specification { // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - def indexedRecord = TransformationUtils.reformatMessageForSearch(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS)) + def indexedRecord = TransformationUtils.reformatMessageForSearch(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet()) // def indexedRecord = DataUtils.removeFromMap(pruned, minus) then: @@ -177,14 +199,431 @@ class TransformationUtilsSpec extends Specification { // garbage:"nuke meeee" // ] - def expectedKeyset = ["fileIdentifier", "parentIdentifier", "doi", "title", "description", "keywords", "topicCategories", "temporalBounding", "spatialBounding", "isGlobal", "acquisitionInstruments", "acquisitionOperations", "acquisitionPlatforms", "dataFormats", "links", "responsibleParties", "thumbnail", "citeAsStatements", "crossReferences", "largerWorks", "legalConstraints", "services", "gcmdVerticalResolution", "gcmdDataCenters", "gcmdTemporalResolution", "gcmdLocations", "gcmdScience", "beginDate", "endDate", "endDayOfYear", "beginYear", "endMonth", "endYear", "endDayOfMonth", "dataFormat", "linkProtocol", "serviceLinks", "serviceLinkProtocol", "internalParentIdentifier", "filename", "checksums"] + // def expectedKeyset = ["fileIdentifier", "parentIdentifier", "doi", "title", "description", "keywords", "topicCategories", "temporalBounding", "spatialBounding", "isGlobal", "acquisitionInstruments", "acquisitionOperations", "acquisitionPlatforms", "dataFormats", "links", "responsibleParties", "thumbnail", "citeAsStatements", "crossReferences", "largerWorks", "legalConstraints", "services", "gcmdVerticalResolution", "gcmdDataCenters", "gcmdTemporalResolution", "gcmdLocations", "gcmdScience", "beginDate", "endDate", "endDayOfYear", "beginYear", "endMonth", "endYear", "endDayOfMonth", "dataFormat", "linkProtocol", "serviceLinks", "serviceLinkProtocol", "internalParentIdentifier", "filename", "checksums"] - indexedRecord.keySet().size() == expectedKeyset.size() - indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) + indexedRecord.keySet().size() == granuleFields.size() + indexedRecord.keySet().each({ assert granuleFields.contains(it) }) } + // def "prune fields - spatial"() { + // when: + // def mapWithSpatial = [ + // spatialBounding: [ + // type: "MultiPolygon", + // coordinates: [ + // [ + // [ + // [-180.0, -14.28], + // [-61.821, -14.28], + // [-61.821, 70.4], + // [-180.0, 70.4], + // [-180.0, -14.28] + // ] + // ], + // [ + // [ + // [144.657, -14.28], + // [180.0, -14.28], + // [180.0, 70.4], + // [144.657, 70.4], + // [144.657, -14.28] + // ] + // ] + // ] + // ] 
+ // ] + // def minus = TransformationUtils.identifyUnmappedFields(mapWithSpatial, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS)) + // + // then: + // minus == [] + // } + +// def "debug integration" () { +// when: +// def jsonrecord = (new JsonSlurper()).parseText("""{ +// "type": "collection", +// "discovery": { +// "fileIdentifier": "gov.noaa.nodc:NDBC-COOPS", +// "parentIdentifier": null, +// "hierarchyLevelName": null, +// "doi": "doi:10.5072/FK2TEST", +// "purpose": "Basic research", +// "status": "completed", +// "credit": null, +// "title": "Coastal meteorological and water temperature data from National Water Level Observation Network (NWLON) and Physical Oceanographic Real-Time System (PORTS) stations of the NOAA Center for Operational Oceanographic Products and Services (CO-OPS)", +// "alternateTitle": null, +// "description": "The National Water Level Observation Network (NWLON) is a network of long-term water level stations operated and maintained by CO-OPS. NWLON stations are located on shore-based platforms, and primarily collect real-time water level measurements. As of January 2013, approximately 180 of 210 NWLON stations also collect real-time meteorological data. About 20 CO-OPS Physical Oceanographic Real-Time Systems (PORTS) comprise a group of water level stations, and 65 of these stations also collect real-time meteorological data. Data parameters include barometric pressure, wind direction, speed and gust, air temperature, and water temperature.", +// "keywords": [{ +// "values": ["DOC/NOAA/NESDIS/NODC > National Oceanographic Data Center, NESDIS, NOAA, U.S. Department of Commerce", "DOC/NOAA/NESDIS/NCEI > National Centers for Environmental Information, NESDIS, NOAA, U.S. Department of Commerce"], +// "type": "dataCenter", +// "namespace": "GCMD Keywords - Data Centers" +// }, { +// "values": ["0107939", "0108059", "0109292", "0111163", "0112393", "0113250", "0113898", "0114473", "0115274", "0115910", "0116703", "0117348", "0117811", "0118682", "0120725", "0120726", "0122183", "0122220", "0123085", "0123363", "0124305", "0125493", "0126410", "0126781", "0127407", "0128443", "0129526", "0130004", "0131097", "0131931", "0137308", "0138303", "0139574", "0141136", "0144301", "0145770", "0148198", "0151779", "0154391", "0155989"], +// "type": null, +// "namespace": "NCEI ACCESSION NUMBER" +// }, { +// "values": ["AIR TEMPERATURE", "BAROMETRIC PRESSURE", "DEWPOINT", "RELATIVE HUMIDITY", "SEA SURFACE TEMPERATURE", "VISIBILITY", "WIND DIRECTION", "WIND GUST", "WIND SPEED"], +// "type": "theme", +// "namespace": "NODC DATA TYPES THESAURUS" +// }, { +// "values": ["anemometer", "barometers", "meteorological sensors", "thermistor"], +// "type": "instrument", +// "namespace": "NODC INSTRUMENT TYPES THESAURUS" +// }, { +// "values": ["meteorological", "physical"], +// "type": "theme", +// "namespace": "NODC OBSERVATION TYPES THESAURUS" +// }, { +// "values": ["FIXED PLATFORM"], +// "type": "platform", +// "namespace": "NODC PLATFORM NAMES THESAURUS" +// }, { +// "values": ["US DOC; NOAA; NOS; Center for Operational Oceanographic Products and Services"], +// "type": "dataCenter", +// "namespace": "NODC COLLECTING INSTITUTION NAMES THESAURUS" +// }, { +// "values": ["US DOC; NOAA; NWS; National Data Buoy Center"], +// "type": "dataCenter", +// "namespace": "NODC SUBMITTING INSTITUTION NAMES THESAURUS" +// }, { +// "values": ["National Water Level Observation Network (NWLON)", "Physical Oceanographic Real-Time System (PORTS)"], +// "type": 
"project", +// "namespace": "NODC PROJECT NAMES THESAURUS" +// }, { +// "values": ["Bay of Fundy", "Beaufort Sea", "Bering Sea", "Caribbean Sea", "Coastal waters of Alabama", "Coastal Waters of Florida", "Coastal Waters of Louisiana", "Coastal Waters of Mississippi", "Coastal Waters of Southeast Alaska and British Columbia", "Coastal Waters of Texas", "Florida Keys National Marine Sanctuary", "Great Lakes", "Gulf of Alaska", "Gulf of Mexico", "Kaneohe Bay", "Monterey Bay National Marine Sanctuary", "North Atlantic Ocean", "North Pacific Ocean", "Papahanaumokuakea Marine National Monument", "Philippine Sea", "San Diego Bay", "South Pacific Ocean", "Yaquina Bay"], +// "type": "place", +// "namespace": "NODC SEA AREA NAMES THESAURUS" +// }, { +// "values": ["oceanography"], +// "type": "theme", +// "namespace": "WMO_CategoryCode" +// }, { +// "values": ["GOVERNMENT AGENCIES-U.S. FEDERAL AGENCIES > DOC > NOAA > DOC/NOAA/NOS/CO-OPS > Center for Operational Oceanographic Products and Services, National Ocean Service, NOAA, U.S. Department of Commerce > http://tidesandcurrents.noaa.gov/", "GOVERNMENT AGENCIES-U.S. FEDERAL AGENCIES > DOC > NOAA > DOC/NOAA/NWS/NDBC > National Data Buoy Center, National Weather Service, NOAA, U.S. Department of Commerce > http://www.ndbc.noaa.gov/"], +// "type": "dataCenter", +// "namespace": "GCMD Keywords - Data Centers" +// }, { +// "values": ["EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC PRESSURE", "EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC TEMPERATURE", "EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC TEMPERATURE > SURFACE TEMPERATURE > DEW POINT TEMPERATURE", "EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > HUMIDITY", "EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND SPEED/WIND DIRECTION", "EARTH SCIENCE > OCEANS > OCEAN OPTICS", "EARTH SCIENCE > OCEANS > OCEAN TEMPERATURE > SEA SURFACE TEMPERATURE"], +// "type": "theme", +// "namespace": "GCMD Keywords - Science Keywords" +// }, { +// "values": ["In Situ/Laboratory Instruments > Current/Wind Meters > ANEMOMETERS", "In Situ/Laboratory Instruments > Pressure/Height Meters > BAROMETERS", "In Situ/Laboratory Instruments > Temperature/Humidity Sensors > Thermistors > THERMISTORS"], +// "type": "instrument", +// "namespace": "GCMD Keywords - Instruments" +// }, { +// "values": ["air_pressure_at_sea_level", "air_temperature", "dew_point_temperature", "relative_humidity", "sea_surface_temperature", "time", "visibility_in_air", "wind_from_direction", "wind_speed", "wind_speed_of_gust"], +// "type": "theme", +// "namespace": "NetCDF Climate and Forecast (CF) Metadata Convention Standard Name Table" +// }, { +// "values": ["air_temperature_sensor", "anemometer", "barometer", "ct_sensor", "humidity_sensor", "ocean_temperature_sensor", "visibility_sensor"], +// "type": "instrument", +// "namespace": "NOS SENSOR THESAURUS" +// }, { +// "values": ["1611400 - NWWH1", "1612340 - OOUH1", "1612480 - MOKH1", "1615680 - KLIH1", "1617433 - KWHH1", "1617760 - ILOH1", "1619910 - SNDP5", "1630000 - APRP7", "1631428 - PGBP7", "1770000 - NSTP6", "1820000 - KWJP8", "1890000 - WAKP8", "2695540 - BEPB6", "8311030 - OBGN6", "8311062 - ALXN6", "8410140 - PSBM1", "8411060 - CFWM1", "8413320 - ATGM1", "8418150 - CASM1", "8419317 - WELM1", "8443970 - BHBM3", "8447386 - FRVM3", "8447387 - BLTM3", "8447412 - FRXM3", "8447930 - BZBM3", "8449130 - NTKM3", "8452660 - NWPR1", "8452944 - CPTR1", "8452951 - PTCR1", "8454000 - FOXR1", "8454049 - QPTR1", "8461490 - NLNC3", "8465705 - NWHC3", "8467150 - BRHC3", "8510560 - MTKN6", "8516945 
- KPTN6", "8518750 - BATN6", "8519483 - BGNN4", "8519532 - MHRN6", "8530973 - ROBN4", "8531680 - SDHN4", "8534720 - ACYN4", "8536110 - CMAN4", "8537121 - SJSN4", "8538886 - TPBN4", "8539094 - BDRN4", "8540433 - MRCP1", "8545240 - PHBP1", "8548989 - NBLP1", "8551762 - DELD1", "8551910 - RDYD1", "8557380 - LWSD1", "8570283 - OCIM2", "8571421 - BISM2", "8571892 - CAMM2", "8573364 - TCBM2", "8573927 - CHCM2", "8574680 - BLTM2", "8574728 - FSKM2", "8575512 - APAM2", "8577018 - COVM2", "8577330 - SLIM2", "8578240 - PPTM2", "8594900 - WASD2", "8631044 - WAHV2", "8632200 - KPTV2", "8632837 - RPLV2", "8635027 - NCDV2", "8635750 - LWTV2", "8637611 - YKRV2", "8637689 - YKTV2", "8638511 - DOMV2", "8638595 - CRYV2", "8638610 - SWPV2", "8638614 - WDSV2", "8638863 - CBBV2", "8638999 - CHYV2", "8639348 - MNPV2", "8651370 - DUKN7", "8652587 - ORIN7", "8654467 - HCGN7", "8656483 - BFTN7", "8658120 - WLON7", "8658163 - JMPN7", "8661070 - MROS1", "8665530 - CHTS1", "8670870 - FPKG1", "8720030 - FRDF1", "8720215 - NFDF1", "8720218 - MYPF1", "8720219 - DMSF1", "8720228 - LTJF1", "8720233 - BLIF1", "8720245 - JXUF1", "8720357 - BKBF1", "8720503 - GCVF1", "8721604 - TRDF1", "8722670 - LKWF1", "8723214 - VAKF1", "8723970 - VCAF1", "8724580 - KYWF1", "8725110 - NPSF1", "8725520 - FMRF1", "8726384 - PMAF1", "8726412 - MTBF1", "8726520 - SAPF1", "8726607 - OPTF1", "8726667 - MCYF1", "8726669 - ERTF1", "8726673 - SBLF1", "8726679 - TSHF1", "8726694 - TPAF1", "8726724 - CWBF1", "8727520 - CKYF1", "8728690 - APCF1", "8729108 - PACF1", "8729210 - PCBF1", "8729840 - PCLF1", "8732828 - WBYA1", "8734673 - FMOA1", "8735180 - DILA1", "8736163 - MBPA1", "8736897 - MCGA1", "8737005 - PTOA1", "8737048 - OBLA1", "8741003 - PTBM6", "8741041 - ULAM6", "8741094 - RARM6", "8741501 - DKCM6", "8741533 - PNLM6", "8747437 - WYCM6", "8760721 - PILL1", "8760922 - PSTL1", "8761305 - SHBL1", "8761724 - GISL1", "8761927 - NWCL1", "8761955 - CARL1", "8762482 - BYGL1", "8762484 - FREL1", "8764044 - TESL1", "8764227 - AMRL1", "8764314 - EINL1", "8766072 - FRWL1", "8767816 - LCLL1", "8767961 - BKTL1", "8768094 - CAPL1", "8770570 - SBPT2", "8770613 - MGPT2", "8770822 - TXPT2", "8771013 - EPTT2", "8771341 - GNJT2", "8771450 - GTOT2", "8772447 - FCGT2", "8774770 - RCPT2", "8775870 - MQTT2", "8779770 - PTIT2", "9014070 - AGCM4", "9014090 - MBRM4", "9014098 - FTGM4", "9052030 - OSGN6", "9052058 - RCRN6", "9063012 - NIAN6", "9063020 - BUFN6", "9063028 - PSTN6", "9063038 - EREP1", "9063053 - FAIO1", "9063063 - CNDO1", "9063079 - MRHO1", "9063085 - THRO1", "9075014 - HRBM4", "9075065 - LPNM4", "9075080 - MACM4", "9075099 - DTLM4", "9076024 - RCKM4", "9076027 - WNEM4", "9076033 - LTRM4", "9076070 - SWPM4", "9087023 - LDTM4", "9087031 - HLNM4", "9087044 - CMTI2", "9087069 - KWNW3", "9087088 - MNMM4", "9087096 - PNLM4", "9099004 - PTIM4", "9099018 - MCGM4", "9099064 - DULM5", "9099090 - GDMM5", "9410170 - SDBC1", "9410172 - IIWC1", "9410230 - LJAC1", "9410660 - OHBC1", "9410665 - PRJC1", "9410670 - PFXC1", "9410840 - ICAC1", "9411340 - NTBC1", "9411406 - HRVC1", "9412110 - PSLC1", "9413450 - MTYC1", "9414290 - FTPC1", "9414296 - PXSC1", "9414311 - PXOC1", "9414523 - RTYC1", "9414750 - AAMC1", "9414763 - LNDC1", "9414769 - OMHC1", "9414776 - OKXC1", "9414797 - OBXC1", "9414847 - PPXC1", "9414863 - RCMC1", "9415020 - PRYC1", "9415102 - MZXC1", "9415115 - PSBC1", "9415118 - UPBC1", "9415141 - DPXC1", "9415144 - PCOC1", "9416841 - ANVC1", "9418767 - HBYC1", "9419750 - CECC1", "9431647 - PORO3", "9432780 - CHAO3", "9435380 - SBEO3", "9437540 - TLBO3", "9439011 - 
HMDO3", "9439040 - ASTO3", "9440422 - LOPW1", "9440910 - TOKW1", "9441102 - WPTW1", "9442396 - LAPW1", "9443090 - NEAW1", "9444090 - PTAW1", "9444900 - PTWW1", "9446482 - TCMW1", "9446484 - TCNW1", "9447130 - EBSW1", "9449424 - CHYW1", "9449880 - FRDW1", "9450460 - KECA2", "9451054 - PLXA2", "9451600 - ITKA2", "9452210 - JNEA2", "9452400 - SKTA2", "9452634 - ELFA2", "9453220 - YATA2", "9454050 - CRVA2", "9454240 - VDZA2", "9455090 - SWLA2", "9455500 - OVIA2", "9455760 - NKTA2", "9455920 - ANTA2", "9457292 - KDAA2", "9457804 - ALIA2", "9459450 - SNDA2", "9459881 - KGCA2", "9461380 - ADKA2", "9461710 - ATKA2", "9462450 - OLSA2", "9462620 - UNLA2", "9463502 - PMOA2", "9464212 - VCVA2", "9468756 - NMTA2", "9491094 - RDDA2", "9497645 - PRDA2", "9751364 - CHSV3", "9751381 - LAMV3", "9751401 - LTBV3", "9751639 - CHAV3", "9752695 - ESPP4", "9755371 - SJNP4", "9759110 - MGIP4", "9759394 - MGZP4", "9759412 - AUDP4", "9759938 - MISP4", "9761115 - BARA9"], +// "type": "platform", +// "namespace": "NOS - NWSLI PLATFORM THESAURUS" +// }, { +// "values": ["CONTINENT > NORTH AMERICA > CANADA > GREAT LAKES, CANADA", "CONTINENT > NORTH AMERICA > UNITED STATES OF AMERICA > GREAT LAKES", "OCEAN > ARCTIC OCEAN > BEAUFORT SEA", "OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN", "OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN > BAY OF FUNDY", "OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN > CARIBBEAN SEA", "OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN > GULF OF MEXICO", "OCEAN > PACIFIC OCEAN > CENTRAL PACIFIC OCEAN > HAWAIIAN ISLANDS", "OCEAN > PACIFIC OCEAN > NORTH PACIFIC OCEAN", "OCEAN > PACIFIC OCEAN > NORTH PACIFIC OCEAN > BERING SEA", "OCEAN > PACIFIC OCEAN > NORTH PACIFIC OCEAN > GULF OF ALASKA", "OCEAN > PACIFIC OCEAN > SOUTH PACIFIC OCEAN"], +// "type": "place", +// "namespace": "GCMD Keywords - Locations" +// }], +// "topicCategories": ["environment", "oceans", "climatologyMeteorologyAtmosphere"], +// "temporalBounding": { +// "beginDate": "2013-03-01", +// "beginIndeterminate": null, +// "endDate": null, +// "endIndeterminate": "now", +// "instant": null, +// "instantIndeterminate": null, +// "description": null +// }, +// "spatialBounding": { +// "type": "MultiPolygon", +// "coordinates": [ +// [ +// [ +// [-180.0, -14.28], +// [-61.821, -14.28], +// [-61.821, 70.4], +// [-180.0, 70.4], +// [-180.0, -14.28] +// ] +// ], +// [ +// [ +// [144.657, -14.28], +// [180.0, -14.28], +// [180.0, 70.4], +// [144.657, 70.4], +// [144.657, -14.28] +// ] +// ] +// ] +// }, +// "isGlobal": false, +// "acquisitionInstruments": [], +// "acquisitionOperations": [], +// "acquisitionPlatforms": [], +// "dataFormats": [{ +// "name": "ORIGINATOR DATA FORMAT", +// "version": null +// }], +// "links": [{ +// "linkName": "Descriptive Information", +// "linkProtocol": "HTTP", +// "linkUrl": "http://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.nodc:NDBC-COOPS", +// "linkDescription": "Navigate directly to the URL for a descriptive web page with download links.", +// "linkFunction": "information" +// }, { +// "linkName": "Granule Search", +// "linkProtocol": "HTTP", +// "linkUrl": "http://www.nodc.noaa.gov/search/granule/rest/find/document?searchText=fileIdentifier%3ACO-OPS*&start=1&max=100&expandResults=true&f=searchPage", +// "linkDescription": "Granule Search", +// "linkFunction": "search" +// }, { +// "linkName": "THREDDS", +// "linkProtocol": "THREDDS", +// "linkUrl": "http://data.nodc.noaa.gov/thredds/catalog/ndbc/co-ops/", +// "linkDescription": "These data are available through a variety of services via a THREDDS 
(Thematic Real-time Environmental Distributed Data Services) Data Server (TDS). Depending on the dataset, the TDS can provide WMS, WCS, DAP, HTTP, and other data access and metadata services as well. For more information on the TDS, see http://www.unidata.ucar.edu/software/thredds/current/tds/.", +// "linkFunction": "download" +// }, { +// "linkName": "OPeNDAP", +// "linkProtocol": "DAP", +// "linkUrl": "http://data.nodc.noaa.gov/opendap/ndbc/co-ops/", +// "linkDescription": "These data are available through the Data Access Protocol (DAP) via an OPeNDAP Hyrax server. For a listing of OPeNDAP clients which may be used to access OPeNDAP-enabled data sets, please see the OPeNDAP website at http://opendap.org/.", +// "linkFunction": "download" +// }, { +// "linkName": "HTTP", +// "linkProtocol": "HTTP", +// "linkUrl": "http://data.nodc.noaa.gov/ndbc/co-ops/", +// "linkDescription": "Navigate directly to the URL for data access and direct download.", +// "linkFunction": "download" +// }, { +// "linkName": "FTP", +// "linkProtocol": "FTP", +// "linkUrl": "ftp://ftp.nodc.noaa.gov/pub/data.nodc/ndbc/co-ops/", +// "linkDescription": "These data are available through the File Transfer Protocol (FTP). You may use any FTP client to download these data.", +// "linkFunction": "download" +// }], +// "responsibleParties": [{ +// "individualName": null, +// "organizationName": "DOC/NOAA/NESDIS/NCEI > National Centers for Environmental Information, NESDIS, NOAA, U.S. Department of Commerce", +// "positionName": null, +// "role": "publisher", +// "email": "NODC.DataOfficer@noaa.gov", +// "phone": "301-713-3277" +// }, { +// "individualName": null, +// "organizationName": "DOC/NOAA/NESDIS/NODC > National Oceanographic Data Center, NESDIS, NOAA, U.S. Department of Commerce", +// "positionName": null, +// "role": "publisher", +// "email": "NODC.DataOfficer@noaa.gov", +// "phone": "301-713-3277" +// }, { +// "individualName": "Rex V Hervey", +// "organizationName": "US DOC; NOAA; NWS; National Data Buoy Center (NDBC)", +// "positionName": null, +// "role": "resourceProvider", +// "email": "rex.hervey@noaa.gov", +// "phone": "228-688-3007" +// }, { +// "individualName": null, +// "organizationName": "US DOC; NOAA; NWS; National Data Buoy Center (NDBC)", +// "positionName": null, +// "role": "resourceProvider", +// "email": null, +// "phone": null +// }, { +// "individualName": null, +// "organizationName": "DOC/NOAA/NESDIS/NCEI > National Centers for Environmental Information, NESDIS, NOAA, U.S. Department of Commerce", +// "positionName": null, +// "role": "pointOfContact", +// "email": "NCEI.Info@noaa.gov", +// "phone": "301-713-3277" +// }, { +// "individualName": null, +// "organizationName": "Global Change Data Center, Science and Exploration Directorate, Goddard Space Flight Center (GSFC) National Aeronautics and Space Administration (NASA)", +// "positionName": null, +// "role": "custodian", +// "email": null, +// "phone": null +// }], +// "thumbnail": "http://data.nodc.noaa.gov/cgi-bin/gfx?id=gov.noaa.nodc:NDBC-COOPS", +// "thumbnailDescription": "Preview graphic", +// "creationDate": null, +// "revisionDate": null, +// "publicationDate": "2013-06-05", +// "citeAsStatements": ["Cite as: Hervey, R. V. and US DOC; NOAA; NWS; National Data Buoy Center (2013). Coastal meteorological and water temperature data from National Water Level Observation Network (NWLON) and Physical Oceanographic Real-Time System (PORTS) stations of the NOAA Center for Operational Oceanographic Products and Services (CO-OPS). 
National Oceanographic Data Center, NOAA. Dataset. [access date]"], +// "crossReferences": [], +// "largerWorks": [], +// "useLimitation": "accessLevel: Public", +// "legalConstraints": ["Cite as: Hervey, R. V. and US DOC; NOAA; NWS; National Data Buoy Center (2013). Coastal meteorological and water temperature data from National Water Level Observation Network (NWLON) and Physical Oceanographic Real-Time System (PORTS) stations of the NOAA Center for Operational Oceanographic Products and Services (CO-OPS). National Oceanographic Data Center, NOAA. Dataset. [access date]", "NOAA and NCEI cannot provide any warranty as to the accuracy, reliability, or completeness of furnished data. Users assume responsibility to determine the usability of these data. The user is responsible for the results of any application of this data for other than its intended purpose."], +// "accessFeeStatement": null, +// "orderingInstructions": null, +// "edition": null, +// "dsmmAccessibility": 0, +// "dsmmDataIntegrity": 0, +// "dsmmDataQualityAssessment": 0, +// "dsmmDataQualityAssurance": 0, +// "dsmmDataQualityControlMonitoring": 0, +// "dsmmPreservability": 0, +// "dsmmProductionSustainability": 0, +// "dsmmTransparencyTraceability": 0, +// "dsmmUsability": 0, +// "dsmmAverage": 0.0, +// "updateFrequency": "asNeeded", +// "presentationForm": "tableDigital", +// "services": [] +// }, +// "analysis": { +// "identification": { +// "fileIdentifierExists": true, +// "fileIdentifierString": "gov.noaa.nodc:NDBC-COOPS", +// "doiExists": true, +// "doiString": "doi:10.5072/FK2TEST", +// "parentIdentifierExists": false, +// "parentIdentifierString": null, +// "hierarchyLevelNameExists": false, +// "isGranule": false +// }, +// "titles": { +// "titleExists": true, +// "titleCharacters": 244, +// "alternateTitleExists": false, +// "alternateTitleCharacters": 0, +// "titleFleschReadingEaseScore": -15.662258064516124, +// "alternateTitleFleschReadingEaseScore": null, +// "titleFleschKincaidReadingGradeLevel": 23.14516129032258, +// "alternateTitleFleschKincaidReadingGradeLevel": null +// }, +// "description": { +// "descriptionExists": true, +// "descriptionCharacters": 642, +// "descriptionFleschReadingEaseScore": 24.320808988764043, +// "descriptionFleschKincaidReadingGradeLevel": 14.289078651685397 +// }, +// "dataAccess": { +// "dataAccessExists": true +// }, +// "thumbnail": { +// "thumbnailExists": true +// }, +// "temporalBounding": { +// "beginDescriptor": "VALID", +// "beginPrecision": "Days", +// "beginIndexable": true, +// "beginZoneSpecified": null, +// "beginUtcDateTimeString": "2013-03-01T00:00:00Z", +// "beginYear": 2013, +// "beginDayOfYear": 60, +// "beginDayOfMonth": 1, +// "beginMonth": 3, +// "endDescriptor": "UNDEFINED", +// "endPrecision": null, +// "endIndexable": true, +// "endZoneSpecified": null, +// "endUtcDateTimeString": null, +// "endYear": null, +// "endDayOfYear": null, +// "endDayOfMonth": null, +// "endMonth": null, +// "instantDescriptor": "UNDEFINED", +// "instantPrecision": null, +// "instantIndexable": true, +// "instantZoneSpecified": null, +// "instantUtcDateTimeString": null, +// "instantYear": null, +// "instantDayOfYear": null, +// "instantDayOfMonth": null, +// "instantMonth": null, +// "rangeDescriptor": "ONGOING" +// }, +// "spatialBounding": { +// "spatialBoundingExists": true, +// "isValid": true, +// "validationError": null +// } +// }, +// "fileInformation": null, +// "fileLocations": {}, +// "publishing": { +// "isPrivate": false, +// "until": null +// }, +// 
"relationships": [], +// "errors": [] +// }""") +// def record = AvroUtils.mapToAvro((Map)jsonrecord, ParsedRecord) +// +// // println("zeb "+JsonOutput.toJson(parsed)) +// println("ZEB") +// println(record) +// def discovery = record.getDiscovery(); +// def analysis = record.getAnalysis(); +// def discoveryMap = AvroUtils.avroToMap(discovery, true); +// +// // prepare and apply fields that need to be reformatted for search +// discoveryMap.putAll(TransformationUtils.prepareGcmdKeyword(discovery)); +// discoveryMap.putAll(TransformationUtils.prepareDates(discovery.getTemporalBounding(), analysis.getTemporalBounding())); +// discoveryMap.put("dataFormat", TransformationUtils.prepareDataFormats(discovery)); +// discoveryMap.put("linkProtocol", TransformationUtils.prepareLinkProtocols(discovery)); +// discoveryMap.put("serviceLinks", TransformationUtils.prepareServiceLinks(discovery)); +// discoveryMap.put("serviceLinkProtocol", TransformationUtils.prepareServiceLinkProtocols(discovery)); +// discoveryMap.putAll(TransformationUtils.prepareResponsibleParties(record)); +// discoveryMap.put("internalParentIdentifier", TransformationUtils.prepareInternalParentIdentifier(record)); +// discoveryMap.put("filename", TransformationUtils.prepareFilename(record)); +// discoveryMap.put("checksums", TransformationUtils.prepareChecksums(record)); +// +// def pruned = TransformationUtils.pruneKnownUnmappedFields(discoveryMap, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) +// def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS)) +// // def indexedRecord = DataUtils.removeFromMap(pruned, minus) +// +// println(JsonOutput.toJson(pruned)) +// println(JsonOutput.toJson(minus)) +// then: +// pruned == [] +// minus == [ +// internalParentIdentifier: null, // ok for granule, not collection +// temporalBounding: [ +// fakeField: 123 +// ], +// errors: [ +// [ +// nonsense: "horrible", +// ] +// ], +// garbage:"nuke meeee" +// ] +// // +// // def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "errors" ] +// // indexedRecord.keySet().size() == expectedKeyset.size() +// // indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) +// // +// // indexedRecord.temporalBounding == [ +// // beginDescriptor: ValidDescriptor.VALID, +// // beginPrecision: ChronoUnit.DAYS.toString(), +// // beginIndexable: true, +// // beginZoneSpecified: null, +// // beginUtcDateTimeString: "2000-02-01", +// // endDescriptor: null, +// // endPrecision: null, +// // endIndexable: null, +// // endZoneSpecified: null, +// // endUtcDateTimeString: null, +// // instantDescriptor: null, +// // instantPrecision: null, +// // instantIndexable: null, +// // instantZoneSpecified: null, +// // instantUtcDateTimeString: null, +// // rangeDescriptor: null +// // ] +// // +// // indexedRecord.errors.size() == 1 +// // indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired +// // source: "valid field" +// // ] +// } def "clean up nested map before indexing strictly mapped fields for search (collection)"() { when: @@ -207,251 +646,251 @@ class TransformationUtilsSpec extends Specification { // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - def indexedRecord = TransformationUtils.reformatMessageForSearch(record, 
TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS)) + def indexedRecord = TransformationUtils.reformatMessageForSearch(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet()) // def indexedRecord = DataUtils.removeFromMap(pruned, minus) then: + // + // def expectedKeyset = ["fileIdentifier", "parentIdentifier", "doi", "title", "description", "keywords", "topicCategories", "temporalBounding", "spatialBounding", "isGlobal", "acquisitionInstruments", "acquisitionOperations", "acquisitionPlatforms", "dataFormats", "links", "responsibleParties", "thumbnail", "citeAsStatements", "crossReferences", "largerWorks", "useLimitation", "legalConstraints", "accessFeeStatement", "orderingInstructions", "edition", "dsmmAverage", "services", "gcmdVerticalResolution", "gcmdDataCenters", "gcmdTemporalResolution", "gcmdLocations", "gcmdScience", "beginDate", "endDate", "endDayOfYear", "beginYear", "endMonth", "endYear", "endDayOfMonth", "dataFormat", "linkProtocol", "serviceLinks", "serviceLinkProtocol", "organizationNames", + // "individualNames", "checksums"] - def expectedKeyset = ["fileIdentifier", "parentIdentifier", "doi", "title", "description", "keywords", "topicCategories", "temporalBounding", "spatialBounding", "isGlobal", "acquisitionInstruments", "acquisitionOperations", "acquisitionPlatforms", "dataFormats", "links", "responsibleParties", "thumbnail", "citeAsStatements", "crossReferences", "largerWorks", "useLimitation", "legalConstraints", "accessFeeStatement", "orderingInstructions", "edition", "dsmmAverage", "services", "gcmdVerticalResolution", "gcmdDataCenters", "gcmdTemporalResolution", "gcmdLocations", "gcmdScience", "beginDate", "endDate", "endDayOfYear", "beginYear", "endMonth", "endYear", "endDayOfMonth", "dataFormat", "linkProtocol", "serviceLinks", "serviceLinkProtocol", "organizationNames", - "individualNames", "checksums"] - - - indexedRecord.keySet().size() == expectedKeyset.size() - expectedKeyset.each({ assert indexedRecord.keySet().contains(it) }) - indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) - - } - - def "clean up nested map before indexing strictly mapped fields for analysis and errors (granule)"() { // TODO change to use reformatMessageFor method - when: - def parsed = [ - identification: null, - internalParentIdentifier: null, - titles: null, - description: null, - dataAccess: null, - thumbnail: null, - temporalBounding: [ - beginDescriptor: ValidDescriptor.VALID, - beginPrecision: ChronoUnit.DAYS.toString(), - beginIndexable: true, - beginZoneSpecified: null, - beginUtcDateTimeString: "2000-02-01", - beginYear: 2000, - beginDayOfYear: 32, - beginDayOfMonth: 1, - beginMonth: 2, - endDescriptor: null, - endPrecision: null, - endIndexable: null, - endZoneSpecified: null, - endUtcDateTimeString: null, - endYear: null, - endDayOfYear: null, - endDayOfMonth: null, - endMonth: null, - instantDescriptor: null, - instantPrecision: null, - instantIndexable: null, - instantZoneSpecified: null, - instantUtcDateTimeString: null, - instantYear: null, - instantDayOfYear: null, - instantDayOfMonth: null, - instantMonth: null, - rangeDescriptor: null, - fakeField: 123 - ], - spatialBounding: null, - errors: [ - [ - nonsense: "horrible", - source: "valid field" - ] - ], - garbage:"nuke meeee" - ] - - // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) - // .setAnalysis( - // Analysis.newBuilder().setTemporalBounding( - // TemporalBoundingAnalysis.newBuilder() 
- // .setBeginDescriptor(ValidDescriptor.VALID) - // .setBeginIndexable(true) - // .setBeginPrecision(ChronoUnit.DAYS.toString()) - // .setBeginZoneSpecified(null) - // .setBeginUtcDateTimeString("2000-02-01") - // .setBeginYear(2000) - // .setBeginMonth(2) - // .setBeginDayOfYear(32) - // .setBeginDayOfMonth(1) - // .build() - // ).build()).build() - - // def parsed = TransformationUtils.unfilteredAEMessage(record) - - def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - def indexedRecord = DataUtils.removeFromMap(pruned, minus) - then: - minus == [ - temporalBounding: [ - fakeField: 123 - ], - errors: [ - [ - nonsense: "horrible", - ] - ], - garbage:"nuke meeee" - ] - - def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "internalParentIdentifier", "errors" ] - indexedRecord.keySet().size() == expectedKeyset.size() - indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) - - indexedRecord.temporalBounding == [ - beginDescriptor: ValidDescriptor.VALID, - beginPrecision: ChronoUnit.DAYS.toString(), - beginIndexable: true, - beginZoneSpecified: null, - beginUtcDateTimeString: "2000-02-01", - endDescriptor: null, - endPrecision: null, - endIndexable: null, - endZoneSpecified: null, - endUtcDateTimeString: null, - instantDescriptor: null, - instantPrecision: null, - instantIndexable: null, - instantZoneSpecified: null, - instantUtcDateTimeString: null, - rangeDescriptor: null - ] - - indexedRecord.errors.size() == 1 - indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired - source: "valid field" - ] + indexedRecord.keySet().size() == collectionFields.size() + collectionFields.each({ assert indexedRecord.keySet().contains(it) }) + indexedRecord.keySet().each({ assert collectionFields.contains(it) }) } - def "clean up nested map before indexing strictly mapped fields for analysis and errors (collection)"() { // TODO change to use reformatMessageFor method - when: - def parsed = [ - identification: null, - internalParentIdentifier: null, - titles: null, - description: null, - dataAccess: null, - thumbnail: null, - temporalBounding: [ - beginDescriptor: ValidDescriptor.VALID, - beginPrecision: ChronoUnit.DAYS.toString(), - beginIndexable: true, - beginZoneSpecified: null, - beginUtcDateTimeString: "2000-02-01", - beginYear: 2000, - beginDayOfYear: 32, - beginDayOfMonth: 1, - beginMonth: 2, - endDescriptor: null, - endPrecision: null, - endIndexable: null, - endZoneSpecified: null, - endUtcDateTimeString: null, - endYear: null, - endDayOfYear: null, - endDayOfMonth: null, - endMonth: null, - instantDescriptor: null, - instantPrecision: null, - instantIndexable: null, - instantZoneSpecified: null, - instantUtcDateTimeString: null, - instantYear: null, - instantDayOfYear: null, - instantDayOfMonth: null, - instantMonth: null, - rangeDescriptor: null, - fakeField: 123 - ], - spatialBounding: null, - errors: [ - [ - nonsense: "horrible", - source: "valid field" - ] - ], - garbage:"nuke meeee" - ] - - // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) - // .setAnalysis( - // Analysis.newBuilder().setTemporalBounding( - // TemporalBoundingAnalysis.newBuilder() - // .setBeginDescriptor(ValidDescriptor.VALID) - // 
.setBeginIndexable(true) - // .setBeginPrecision(ChronoUnit.DAYS.toString()) - // .setBeginZoneSpecified(null) - // .setBeginUtcDateTimeString("2000-02-01") - // .setBeginYear(2000) - // .setBeginMonth(2) - // .setBeginDayOfYear(32) - // .setBeginDayOfMonth(1) - // .build() - // ).build()).build() - - // def parsed = TransformationUtils.unfilteredAEMessage(record) - - def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - def indexedRecord = DataUtils.removeFromMap(pruned, minus) - - then: - minus == [ - internalParentIdentifier: null, // ok for granule, not collection - temporalBounding: [ - fakeField: 123 - ], - errors: [ - [ - nonsense: "horrible", - ] - ], - garbage:"nuke meeee" - ] - - def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "errors" ] - indexedRecord.keySet().size() == expectedKeyset.size() - indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) - - indexedRecord.temporalBounding == [ - beginDescriptor: ValidDescriptor.VALID, - beginPrecision: ChronoUnit.DAYS.toString(), - beginIndexable: true, - beginZoneSpecified: null, - beginUtcDateTimeString: "2000-02-01", - endDescriptor: null, - endPrecision: null, - endIndexable: null, - endZoneSpecified: null, - endUtcDateTimeString: null, - instantDescriptor: null, - instantPrecision: null, - instantIndexable: null, - instantZoneSpecified: null, - instantUtcDateTimeString: null, - rangeDescriptor: null - ] - - indexedRecord.errors.size() == 1 - indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired - source: "valid field" - ] + // def "clean up nested map before indexing strictly mapped fields for analysis and errors (granule)"() { // TODO change to use reformatMessageFor method + // when: + // def parsed = [ + // identification: null, + // internalParentIdentifier: null, + // titles: null, + // description: null, + // dataAccess: null, + // thumbnail: null, + // temporalBounding: [ + // beginDescriptor: ValidDescriptor.VALID, + // beginPrecision: ChronoUnit.DAYS.toString(), + // beginIndexable: true, + // beginZoneSpecified: null, + // beginUtcDateTimeString: "2000-02-01", + // beginYear: 2000, + // beginDayOfYear: 32, + // beginDayOfMonth: 1, + // beginMonth: 2, + // endDescriptor: null, + // endPrecision: null, + // endIndexable: null, + // endZoneSpecified: null, + // endUtcDateTimeString: null, + // endYear: null, + // endDayOfYear: null, + // endDayOfMonth: null, + // endMonth: null, + // instantDescriptor: null, + // instantPrecision: null, + // instantIndexable: null, + // instantZoneSpecified: null, + // instantUtcDateTimeString: null, + // instantYear: null, + // instantDayOfYear: null, + // instantDayOfMonth: null, + // instantMonth: null, + // rangeDescriptor: null, + // fakeField: 123 + // ], + // spatialBounding: null, + // errors: [ + // [ + // nonsense: "horrible", + // source: "valid field" + // ] + // ], + // garbage:"nuke meeee" + // ] + // + // // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) + // // .setAnalysis( + // // Analysis.newBuilder().setTemporalBounding( + // // TemporalBoundingAnalysis.newBuilder() + // // .setBeginDescriptor(ValidDescriptor.VALID) + // // .setBeginIndexable(true) + // // 
.setBeginPrecision(ChronoUnit.DAYS.toString()) + // // .setBeginZoneSpecified(null) + // // .setBeginUtcDateTimeString("2000-02-01") + // // .setBeginYear(2000) + // // .setBeginMonth(2) + // // .setBeginDayOfYear(32) + // // .setBeginDayOfMonth(1) + // // .build() + // // ).build()).build() + // + // // def parsed = TransformationUtils.unfilteredAEMessage(record) + // + // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) + // def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + // def indexedRecord = DataUtils.removeFromMap(pruned, minus) + // + // then: + // minus == [ + // temporalBounding: [ + // fakeField: 123 + // ], + // errors: [ + // [ + // nonsense: "horrible", + // ] + // ], + // garbage:"nuke meeee" + // ] + // + // def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "internalParentIdentifier", "errors" ] + // indexedRecord.keySet().size() == expectedKeyset.size() + // indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) + // + // indexedRecord.temporalBounding == [ + // beginDescriptor: ValidDescriptor.VALID, + // beginPrecision: ChronoUnit.DAYS.toString(), + // beginIndexable: true, + // beginZoneSpecified: null, + // beginUtcDateTimeString: "2000-02-01", + // endDescriptor: null, + // endPrecision: null, + // endIndexable: null, + // endZoneSpecified: null, + // endUtcDateTimeString: null, + // instantDescriptor: null, + // instantPrecision: null, + // instantIndexable: null, + // instantZoneSpecified: null, + // instantUtcDateTimeString: null, + // rangeDescriptor: null + // ] + // + // indexedRecord.errors.size() == 1 + // indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired + // source: "valid field" + // ] + // + // } - } + // def "clean up nested map before indexing strictly mapped fields for analysis and errors (collection)"() { // TODO change to use reformatMessageFor method + // when: + // def parsed = [ + // identification: null, + // internalParentIdentifier: null, + // titles: null, + // description: null, + // dataAccess: null, + // thumbnail: null, + // temporalBounding: [ + // beginDescriptor: ValidDescriptor.VALID, + // beginPrecision: ChronoUnit.DAYS.toString(), + // beginIndexable: true, + // beginZoneSpecified: null, + // beginUtcDateTimeString: "2000-02-01", + // beginYear: 2000, + // beginDayOfYear: 32, + // beginDayOfMonth: 1, + // beginMonth: 2, + // endDescriptor: null, + // endPrecision: null, + // endIndexable: null, + // endZoneSpecified: null, + // endUtcDateTimeString: null, + // endYear: null, + // endDayOfYear: null, + // endDayOfMonth: null, + // endMonth: null, + // instantDescriptor: null, + // instantPrecision: null, + // instantIndexable: null, + // instantZoneSpecified: null, + // instantUtcDateTimeString: null, + // instantYear: null, + // instantDayOfYear: null, + // instantDayOfMonth: null, + // instantMonth: null, + // rangeDescriptor: null, + // fakeField: 123 + // ], + // spatialBounding: null, + // errors: [ + // [ + // nonsense: "horrible", + // source: "valid field" + // ] + // ], + // garbage:"nuke meeee" + // ] + // + // // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) + // // .setAnalysis( + // // Analysis.newBuilder().setTemporalBounding( + // // TemporalBoundingAnalysis.newBuilder() + // // 
.setBeginDescriptor(ValidDescriptor.VALID) + // // .setBeginIndexable(true) + // // .setBeginPrecision(ChronoUnit.DAYS.toString()) + // // .setBeginZoneSpecified(null) + // // .setBeginUtcDateTimeString("2000-02-01") + // // .setBeginYear(2000) + // // .setBeginMonth(2) + // // .setBeginDayOfYear(32) + // // .setBeginDayOfMonth(1) + // // .build() + // // ).build()).build() + // + // // def parsed = TransformationUtils.unfilteredAEMessage(record) + // + // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) + // def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_ERROR_AND_ANALYSIS_INDEX_ALIAS)) + // def indexedRecord = DataUtils.removeFromMap(pruned, minus) + // + // then: + // minus == [ + // internalParentIdentifier: null, // ok for granule, not collection + // temporalBounding: [ + // fakeField: 123 + // ], + // errors: [ + // [ + // nonsense: "horrible", + // ] + // ], + // garbage:"nuke meeee" + // ] + // + // def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "errors" ] + // indexedRecord.keySet().size() == expectedKeyset.size() + // indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) + // + // indexedRecord.temporalBounding == [ + // beginDescriptor: ValidDescriptor.VALID, + // beginPrecision: ChronoUnit.DAYS.toString(), + // beginIndexable: true, + // beginZoneSpecified: null, + // beginUtcDateTimeString: "2000-02-01", + // endDescriptor: null, + // endPrecision: null, + // endIndexable: null, + // endZoneSpecified: null, + // endUtcDateTimeString: null, + // instantDescriptor: null, + // instantPrecision: null, + // instantIndexable: null, + // instantZoneSpecified: null, + // instantUtcDateTimeString: null, + // rangeDescriptor: null + // ] + // + // indexedRecord.errors.size() == 1 + // indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired + // source: "valid field" + // ] + // + // } //////////////////////////////// // Identifiers, "Names" // From e74fa5adc18e8c6f616e4cd84f7aec9a97dec9b8 Mon Sep 17 00:00:00 2001 From: Zeb Date: Fri, 5 Jun 2020 10:38:41 -0600 Subject: [PATCH 16/29] Collapse into single method. 
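
The search and analysis-and-errors write paths previously went through two
near-duplicate transforms (reformatMessageForSearch and
reformatMessageForAnalysisAndErrors); both now funnel through a single
reformatMessage driven by the field set of the target index, so a field is
emitted only if the destination mapping actually knows it. Note the put order
inside the forEach: the discovery value is written last, so it wins whenever
both the analysis and discovery maps carry the same field name.

A minimal sketch of the resulting call pattern (illustration only --
esConfig, indexAlias, record, and timestamp are stand-in names, not part of
this patch; the field lookup mirrors what TransformationUtilsSpec does):

    // the per-index mapping decides which fields survive the reformat
    Set<String> targetFields = esConfig.indexedProperties(indexAlias).keySet();
    Map<String, Object> source = new HashMap<>(TransformationUtils.reformatMessage(record, targetFields));
    source.put("stagedDate", timestamp); // stamped by IndexingUtils after the reformat
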
---
 .../onestop/indexer/util/IndexingUtils.java  |  4 +-
 .../indexer/util/TransformationUtils.java    | 46 +++++++++----------
 .../util/TransformationUtilsSpec.groovy      | 12 ++---
 3 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java
index 01a9a14e9..aab650564 100644
--- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java
+++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java
@@ -64,7 +64,7 @@ public static DocWriteRequest buildSearchWriteRequest(String indexName, DocWr
     }
     else {
       var formattedRecord = new HashMap<String, Object>();
-      formattedRecord.putAll(TransformationUtils.reformatMessageForSearch(input.getValue().value(), input.getTargetSearchIndexFields()));
+      formattedRecord.putAll(TransformationUtils.reformatMessage(input.getValue().value(), input.getTargetSearchIndexFields()));
       formattedRecord.put("stagedDate", input.getValue().timestamp());
       return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord);
     }
@@ -76,7 +76,7 @@ public static DocWriteRequest buildAnalysisAndErrorWriteRequest(String indexN
     }
     else {
       var formattedRecord = new HashMap<String, Object>();
-      formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysisAndErrors(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields()));
+      formattedRecord.putAll(TransformationUtils.reformatMessage(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields()));
       formattedRecord.put("stagedDate", input.getValue().timestamp());
       return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord);
     }
diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
index bf0f17d47..e04e6f0d4 100644
--- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
+++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
@@ -30,17 +30,20 @@ public class TransformationUtils {
   static final private Logger log = LoggerFactory.getLogger(TransformationUtils.class);

   ///////////////////////////////////////////////////////////////////////////////
-  // Indexing For Analysis & Errors //
+  // Convert to Indexing Message //
   ///////////////////////////////////////////////////////////////////////////////
-  public static Map<String, Object> reformatMessageForAnalysisAndErrors(ParsedRecord record, Set<String> fields) {
+  public static Map<String, Object> reformatMessage(ParsedRecord record, Set<String> fields) {
+
+    var discovery = record.getDiscovery();
     var analysis = record.getAnalysis();
     var errors = record.getErrors();
-
+    var discoveryMap = AvroUtils.avroToMap(discovery, true);
     var analysisMap = AvroUtils.avroToMap(analysis, true);
     var message = new HashMap<String, Object>();
     fields.forEach(field -> {
       message.put(field, analysisMap.get(field));
+      message.put(field, discoveryMap.get(field));
     });
     if (fields.contains("internalParentIdentifier")) {
       analysisMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record));
     }
     var errorsList = errors.stream()
         .map(e -> AvroUtils.avroToMap(e))
         .collect(Collectors.toList());
-    message.put("errors", errorsList);
-
-    return message;
-  }
-
-  ///////////////////////////////////////////////////////////////////////////////
-  // Indexing For Search //
-  ///////////////////////////////////////////////////////////////////////////////
-  public static Map<String, Object> reformatMessageForSearch(ParsedRecord record, Set<String> fields) {
-    var discovery = record.getDiscovery();
-    var analysis = record.getAnalysis();
-    var discoveryMap = AvroUtils.avroToMap(discovery, true);
+    if (fields.contains("errors")) {
+      message.put("errors", errorsList);
+    }

-    var message = new HashMap<String, Object>();
-    fields.forEach(field -> {
-      message.put(field, discoveryMap.get(field));
-    });
     // prepare and apply fields that need to be reformatted for search
-    message.putAll(prepareGcmdKeyword(discovery));// TODO does this need and iff?
-    message.putAll(prepareDates(discovery.getTemporalBounding(), analysis.getTemporalBounding())); // TODO does this need and iff?
+    Map<String, Set<String>> gcmdKeywords = prepareGcmdKeyword(discovery);
+    gcmdKeywords.forEach((key, value) -> {
+      if (fields.contains(key)) {
+        message.put(key, value);
+      }
+    });
+    Map<String, Object> dates = prepareDates(discovery.getTemporalBounding(), analysis.getTemporalBounding());
+    dates.forEach((key, value) -> {
+      if (fields.contains(key)) {
+        message.put(key, value);
+      }
+    });
     if (fields.contains("dataFormat")) {
       message.put("dataFormat", prepareDataFormats(discovery));
     }
@@ -96,11 +96,7 @@ public static Map<String, Object> reformatMessageForSearch(ParsedRecord record,
     }
     if (fields.contains("checksums")) {
       message.put("checksums", prepareChecksums(record));
-      log.info("ZEB - including checksums (presumed granule)");
-    } else {
-      log.info("ZEB - excluding checksums (presumed collection)");
     }
-
     return message;
   }
diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy
index 83ad4575b..f456be6e5 100644
--- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy
+++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy
@@ -86,7 +86,7 @@ class TransformationUtilsSpec extends Specification {
   ///////////////////////////////
   // def "only mapped #type fields are indexed"() {
   //   when:
-  //   def result = TransformationUtils.reformatMessageForSearch(record, fields)
+  //   def result = TransformationUtils.reformatMessage(record, fields)
   //
   //   then:
   //   result.keySet().each({ assert fields.keySet().contains(it) }) // TODO this is a shallow only check!
@@ -103,7 +103,7 @@ class TransformationUtilsSpec extends Specification {
     ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord)
     .setFileInformation(FileInformation.newBuilder().setChecksums([Checksum.newBuilder().setAlgorithm(ChecksumAlgorithm.MD5).setValue('abc').build()]).build()).build()

-    def indexedRecord = TransformationUtils.reformatMessageForSearch(record, fields)
+    def indexedRecord = TransformationUtils.reformatMessage(record, fields)

     then:
@@ -183,7 +183,7 @@ class TransformationUtilsSpec extends Specification {

     // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields())
-    def indexedRecord = TransformationUtils.reformatMessageForSearch(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet())
+    def indexedRecord = TransformationUtils.reformatMessage(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet())
     // def indexedRecord = DataUtils.removeFromMap(pruned, minus)

     then:
@@ -646,7 +646,7 @@ class TransformationUtilsSpec extends Specification {

     // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields())
-    def indexedRecord = TransformationUtils.reformatMessageForSearch(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet())
+    def indexedRecord = TransformationUtils.reformatMessage(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet())
     // def indexedRecord = DataUtils.removeFromMap(pruned, minus)

     then:
@@ -1043,7 +1043,7 @@ class TransformationUtilsSpec extends Specification {
   def "party names are not included in granule search info"() {
     when:
     def record = TestUtils.inputGranuleRecord // <-- granule!
-    def result = TransformationUtils.reformatMessageForSearch(record, collectionFields) // <-- top level reformat method!
+    def result = TransformationUtils.reformatMessage(record, collectionFields) // <-- top level reformat method!

     then:
     result.individualNames == [] as Set
@@ -1160,7 +1160,7 @@ class TransformationUtilsSpec extends Specification {

   def "accession values are not included"() {
     when:
-      def result = TransformationUtils.reformatMessageForSearch(TestUtils.inputAvroRecord, collectionFields)
+      def result = TransformationUtils.reformatMessage(TestUtils.inputAvroRecord, collectionFields)

     then:
     result.accessionValues == null

From 05658860b31c4624394010744a4143c6b34f4f14 Mon Sep 17 00:00:00 2001
From: Zeb
Date: Fri, 5 Jun 2020 12:09:08 -0600
Subject: [PATCH 17/29] Clean up tests.
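
The four copy-pasted "clean up nested map" specs collapse into one
parameterized "reformatMessage populates with correct fields" test that runs
the same record against all four index aliases, and the temporalBounding
analysis is now rebuilt member-by-member via prepareTemporalBounding instead
of being copied wholesale, so unmapped members such as beginMonth never reach
the strictly mapped indices.

A hedged sketch of the helper's wiring (an assumed call site, not the literal
hunk below; gated like the other prepared fields in reformatMessage):

    if (fields.contains("temporalBounding")) {
      // whitelist only the mapped temporal analysis members
      message.put("temporalBounding", prepareTemporalBounding(record.getAnalysis().getTemporalBounding()));
    }
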
---
 .../indexer/util/TransformationUtils.java |  25 +
 .../util/TransformationUtilsSpec.groovy   | 833 +-----------------
 2 files changed, 71 insertions(+), 787 deletions(-)

diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
index e04e6f0d4..dce5ac3b2 100644
--- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
+++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
@@ -69,6 +69,9 @@ public static Map reformatMessage(ParsedRecord record, Set> prepareResponsibleParties(ParsedRecord r
 ////////////////////////////
 // Dates //
 ////////////////////////////
+
+  private static Map<String, Object> prepareTemporalBounding(TemporalBoundingAnalysis analysis) {
+    var result = new HashMap<String, Object>();
+    result.put("beginDescriptor", analysis.getBeginDescriptor());
+    result.put("beginIndexable", analysis.getBeginIndexable());
+    result.put("beginPrecision", analysis.getBeginPrecision());
+    result.put("beginUtcDateTimeString", analysis.getBeginUtcDateTimeString());
+    result.put("beginZoneSpecified", analysis.getBeginZoneSpecified());
+    result.put("endDescriptor", analysis.getEndDescriptor());
+    result.put("endIndexable", analysis.getEndIndexable());
+    result.put("endPrecision", analysis.getEndPrecision());
+    result.put("endUtcDateTimeString", analysis.getEndUtcDateTimeString());
+    result.put("endZoneSpecified", analysis.getEndZoneSpecified());
+    result.put("instantDescriptor", analysis.getInstantDescriptor());
+    result.put("instantIndexable", analysis.getInstantIndexable());
+    result.put("instantPrecision", analysis.getInstantPrecision());
+    result.put("instantUtcDateTimeString", analysis.getInstantUtcDateTimeString());
+    result.put("instantZoneSpecified", analysis.getInstantZoneSpecified());
+    result.put("rangeDescriptor", analysis.getRangeDescriptor());
+    return result;
+  }
+
   private static Map<String, Object> prepareDates(TemporalBounding bounding, TemporalBoundingAnalysis analysis) {
     String beginDate, endDate;
     Long year;
diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy
index f456be6e5..d3da590de 100644
--- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy
+++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy
@@ -29,9 +29,10 @@ import org.cedar.onestop.kafka.common.util.DataUtils;
 @Unroll
 class TransformationUtilsSpec extends
Specification { // // where: // type | fields | record - // 'collection' | collectionFields | TestUtils.inputCollectionRecord - // 'granule' | granuleFields | TestUtils.inputGranuleRecord + // 'collection' | collectionSearchFields | TestUtils.inputCollectionRecord + // 'granule' | granuleSearchFields | TestUtils.inputGranuleRecord // } - def "why is it complaining about checksums #label"() { + def "reformatMessage populates with correct fields for #label"() { when: ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) - .setFileInformation(FileInformation.newBuilder().setChecksums([Checksum.newBuilder().setAlgorithm(ChecksumAlgorithm.MD5).setValue('abc').build()]).build()).build() + .setFileInformation( + FileInformation.newBuilder() + .setChecksums( + [ + Checksum.newBuilder() + .setAlgorithm(ChecksumAlgorithm.MD5) + .setValue('abc') + .build() + ] + ).build() + ) + .setAnalysis( + Analysis.newBuilder().setTemporalBounding( + TemporalBoundingAnalysis.newBuilder() + .setBeginDescriptor(ValidDescriptor.VALID) + .setBeginIndexable(true) + .setBeginPrecision(ChronoUnit.DAYS.toString()) + .setBeginZoneSpecified(null) + .setBeginUtcDateTimeString("2000-02-01") + .setBeginYear(2000) + .setBeginMonth(2) + .setBeginDayOfYear(32) + .setBeginDayOfMonth(1) + .build() + ).build() + ) + .build() def indexedRecord = TransformationUtils.reformatMessage(record, fields) then: + println(label) + println(JsonOutput.toJson(AvroUtils.avroToMap(record.getAnalysis(), true))) + println(JsonOutput.toJson(indexedRecord)) indexedRecord.keySet().contains("checksums") == shouldIncludeChecksums + indexedRecord.keySet().contains("internalParentIdentifier") == shouldIncludeParentIdentifier + (indexedRecord.keySet().contains("temporalBounding") && indexedRecord.get("temporalBounding").keySet().contains("beginMonth")) == false + (indexedRecord.keySet().contains("temporalBounding") && indexedRecord.get("temporalBounding").keySet().contains("beginIndexable")) == shouldIncludeTemporalAnalysis where: - label | shouldIncludeChecksums | fields - 'collections' | false | TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet() - 'granules' | true | TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet() + label | fields | shouldIncludeChecksums | shouldIncludeTemporalAnalysis | shouldIncludeParentIdentifier + 'search collections' | collectionSearchFields | false | false | false + 'search granules' | granuleSearchFields | true | false | true + 'analysis and errors collections' | collectionAnalysisErrorFields | false | true | false + 'analysis and errors granules' | granuleAnalysisErrorFields | false | true | true } - def "clean up nested map before indexing strictly mapped fields for search (granule)"() { - when: - // def parsed = [ - // identification: null, - // titles: null, - // description: null, - // dataAccess: null, - // thumbnail: null, - // temporalBounding: [ - // beginDescriptor: ValidDescriptor.VALID, - // beginPrecision: ChronoUnit.DAYS.toString(), - // beginIndexable: true, - // beginZoneSpecified: null, - // beginUtcDateTimeString: "2000-02-01", - // beginYear: 2000, - // beginDayOfYear: 32, - // beginDayOfMonth: 1, - // beginMonth: 2, - // endDescriptor: null, - // endPrecision: null, - // endIndexable: null, - // endZoneSpecified: null, - // endUtcDateTimeString: null, - // endYear: null, - // endDayOfYear: null, - // endDayOfMonth: null, - // endMonth: null, - // instantDescriptor: null, - // instantPrecision: null, - 
// instantIndexable: null, - // instantZoneSpecified: null, - // instantUtcDateTimeString: null, - // instantYear: null, - // instantDayOfYear: null, - // instantDayOfMonth: null, - // instantMonth: null, - // rangeDescriptor: null, - // fakeField: 123 - // ], - // spatialBounding: null, - // internalParentIdentifier: null, - // errors: [ - // [ - // nonsense: "horrible", - // source: "valid field" - // ] - // ], - // garbage:"nuke meeee" - // ] - ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) - .setAnalysis( - Analysis.newBuilder().setTemporalBounding( - TemporalBoundingAnalysis.newBuilder() - .setBeginDescriptor(ValidDescriptor.VALID) - .setBeginIndexable(true) - .setBeginPrecision(ChronoUnit.DAYS.toString()) - .setBeginZoneSpecified(null) - .setBeginUtcDateTimeString("2000-02-01") - .setBeginYear(2000) - .setBeginMonth(2) - .setBeginDayOfYear(32) - .setBeginDayOfMonth(1) - .build() - ).build()).build() - - - // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - def indexedRecord = TransformationUtils.reformatMessage(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_SEARCH_INDEX_ALIAS).keySet()) - // def indexedRecord = DataUtils.removeFromMap(pruned, minus) - - then: - // minus == [ - // temporalBounding: [ - // fakeField: 123 - // ], - // errors: [ - // [ - // nonsense: "horrible", - // ] - // ], - // garbage:"nuke meeee" - // ] - - // def expectedKeyset = ["fileIdentifier", "parentIdentifier", "doi", "title", "description", "keywords", "topicCategories", "temporalBounding", "spatialBounding", "isGlobal", "acquisitionInstruments", "acquisitionOperations", "acquisitionPlatforms", "dataFormats", "links", "responsibleParties", "thumbnail", "citeAsStatements", "crossReferences", "largerWorks", "legalConstraints", "services", "gcmdVerticalResolution", "gcmdDataCenters", "gcmdTemporalResolution", "gcmdLocations", "gcmdScience", "beginDate", "endDate", "endDayOfYear", "beginYear", "endMonth", "endYear", "endDayOfMonth", "dataFormat", "linkProtocol", "serviceLinks", "serviceLinkProtocol", "internalParentIdentifier", "filename", "checksums"] - - - indexedRecord.keySet().size() == granuleFields.size() - indexedRecord.keySet().each({ assert granuleFields.contains(it) }) - - } - - // def "prune fields - spatial"() { - // when: - // def mapWithSpatial = [ - // spatialBounding: [ - // type: "MultiPolygon", - // coordinates: [ - // [ - // [ - // [-180.0, -14.28], - // [-61.821, -14.28], - // [-61.821, 70.4], - // [-180.0, 70.4], - // [-180.0, -14.28] - // ] - // ], - // [ - // [ - // [144.657, -14.28], - // [180.0, -14.28], - // [180.0, 70.4], - // [144.657, 70.4], - // [144.657, -14.28] - // ] - // ] - // ] - // ] - // ] - // def minus = TransformationUtils.identifyUnmappedFields(mapWithSpatial, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS)) - // - // then: - // minus == [] - // } - -// def "debug integration" () { -// when: -// def jsonrecord = (new JsonSlurper()).parseText("""{ -// "type": "collection", -// "discovery": { -// "fileIdentifier": "gov.noaa.nodc:NDBC-COOPS", -// "parentIdentifier": null, -// "hierarchyLevelName": null, -// "doi": "doi:10.5072/FK2TEST", -// "purpose": "Basic research", -// "status": "completed", -// "credit": null, -// "title": "Coastal meteorological and water temperature data from National Water Level Observation Network (NWLON) and Physical Oceanographic Real-Time System (PORTS) stations of the NOAA Center 
for Operational Oceanographic Products and Services (CO-OPS)", -// "alternateTitle": null, -// "description": "The National Water Level Observation Network (NWLON) is a network of long-term water level stations operated and maintained by CO-OPS. NWLON stations are located on shore-based platforms, and primarily collect real-time water level measurements. As of January 2013, approximately 180 of 210 NWLON stations also collect real-time meteorological data. About 20 CO-OPS Physical Oceanographic Real-Time Systems (PORTS) comprise a group of water level stations, and 65 of these stations also collect real-time meteorological data. Data parameters include barometric pressure, wind direction, speed and gust, air temperature, and water temperature.", -// "keywords": [{ -// "values": ["DOC/NOAA/NESDIS/NODC > National Oceanographic Data Center, NESDIS, NOAA, U.S. Department of Commerce", "DOC/NOAA/NESDIS/NCEI > National Centers for Environmental Information, NESDIS, NOAA, U.S. Department of Commerce"], -// "type": "dataCenter", -// "namespace": "GCMD Keywords - Data Centers" -// }, { -// "values": ["0107939", "0108059", "0109292", "0111163", "0112393", "0113250", "0113898", "0114473", "0115274", "0115910", "0116703", "0117348", "0117811", "0118682", "0120725", "0120726", "0122183", "0122220", "0123085", "0123363", "0124305", "0125493", "0126410", "0126781", "0127407", "0128443", "0129526", "0130004", "0131097", "0131931", "0137308", "0138303", "0139574", "0141136", "0144301", "0145770", "0148198", "0151779", "0154391", "0155989"], -// "type": null, -// "namespace": "NCEI ACCESSION NUMBER" -// }, { -// "values": ["AIR TEMPERATURE", "BAROMETRIC PRESSURE", "DEWPOINT", "RELATIVE HUMIDITY", "SEA SURFACE TEMPERATURE", "VISIBILITY", "WIND DIRECTION", "WIND GUST", "WIND SPEED"], -// "type": "theme", -// "namespace": "NODC DATA TYPES THESAURUS" -// }, { -// "values": ["anemometer", "barometers", "meteorological sensors", "thermistor"], -// "type": "instrument", -// "namespace": "NODC INSTRUMENT TYPES THESAURUS" -// }, { -// "values": ["meteorological", "physical"], -// "type": "theme", -// "namespace": "NODC OBSERVATION TYPES THESAURUS" -// }, { -// "values": ["FIXED PLATFORM"], -// "type": "platform", -// "namespace": "NODC PLATFORM NAMES THESAURUS" -// }, { -// "values": ["US DOC; NOAA; NOS; Center for Operational Oceanographic Products and Services"], -// "type": "dataCenter", -// "namespace": "NODC COLLECTING INSTITUTION NAMES THESAURUS" -// }, { -// "values": ["US DOC; NOAA; NWS; National Data Buoy Center"], -// "type": "dataCenter", -// "namespace": "NODC SUBMITTING INSTITUTION NAMES THESAURUS" -// }, { -// "values": ["National Water Level Observation Network (NWLON)", "Physical Oceanographic Real-Time System (PORTS)"], -// "type": "project", -// "namespace": "NODC PROJECT NAMES THESAURUS" -// }, { -// "values": ["Bay of Fundy", "Beaufort Sea", "Bering Sea", "Caribbean Sea", "Coastal waters of Alabama", "Coastal Waters of Florida", "Coastal Waters of Louisiana", "Coastal Waters of Mississippi", "Coastal Waters of Southeast Alaska and British Columbia", "Coastal Waters of Texas", "Florida Keys National Marine Sanctuary", "Great Lakes", "Gulf of Alaska", "Gulf of Mexico", "Kaneohe Bay", "Monterey Bay National Marine Sanctuary", "North Atlantic Ocean", "North Pacific Ocean", "Papahanaumokuakea Marine National Monument", "Philippine Sea", "San Diego Bay", "South Pacific Ocean", "Yaquina Bay"], -// "type": "place", -// "namespace": "NODC SEA AREA NAMES THESAURUS" -// }, { -// "values": ["oceanography"], 
-// "type": "theme", -// "namespace": "WMO_CategoryCode" -// }, { -// "values": ["GOVERNMENT AGENCIES-U.S. FEDERAL AGENCIES > DOC > NOAA > DOC/NOAA/NOS/CO-OPS > Center for Operational Oceanographic Products and Services, National Ocean Service, NOAA, U.S. Department of Commerce > http://tidesandcurrents.noaa.gov/", "GOVERNMENT AGENCIES-U.S. FEDERAL AGENCIES > DOC > NOAA > DOC/NOAA/NWS/NDBC > National Data Buoy Center, National Weather Service, NOAA, U.S. Department of Commerce > http://www.ndbc.noaa.gov/"], -// "type": "dataCenter", -// "namespace": "GCMD Keywords - Data Centers" -// }, { -// "values": ["EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC PRESSURE", "EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC TEMPERATURE", "EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC TEMPERATURE > SURFACE TEMPERATURE > DEW POINT TEMPERATURE", "EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WATER VAPOR > HUMIDITY", "EARTH SCIENCE > ATMOSPHERE > ATMOSPHERIC WINDS > SURFACE WINDS > WIND SPEED/WIND DIRECTION", "EARTH SCIENCE > OCEANS > OCEAN OPTICS", "EARTH SCIENCE > OCEANS > OCEAN TEMPERATURE > SEA SURFACE TEMPERATURE"], -// "type": "theme", -// "namespace": "GCMD Keywords - Science Keywords" -// }, { -// "values": ["In Situ/Laboratory Instruments > Current/Wind Meters > ANEMOMETERS", "In Situ/Laboratory Instruments > Pressure/Height Meters > BAROMETERS", "In Situ/Laboratory Instruments > Temperature/Humidity Sensors > Thermistors > THERMISTORS"], -// "type": "instrument", -// "namespace": "GCMD Keywords - Instruments" -// }, { -// "values": ["air_pressure_at_sea_level", "air_temperature", "dew_point_temperature", "relative_humidity", "sea_surface_temperature", "time", "visibility_in_air", "wind_from_direction", "wind_speed", "wind_speed_of_gust"], -// "type": "theme", -// "namespace": "NetCDF Climate and Forecast (CF) Metadata Convention Standard Name Table" -// }, { -// "values": ["air_temperature_sensor", "anemometer", "barometer", "ct_sensor", "humidity_sensor", "ocean_temperature_sensor", "visibility_sensor"], -// "type": "instrument", -// "namespace": "NOS SENSOR THESAURUS" -// }, { -// "values": ["1611400 - NWWH1", "1612340 - OOUH1", "1612480 - MOKH1", "1615680 - KLIH1", "1617433 - KWHH1", "1617760 - ILOH1", "1619910 - SNDP5", "1630000 - APRP7", "1631428 - PGBP7", "1770000 - NSTP6", "1820000 - KWJP8", "1890000 - WAKP8", "2695540 - BEPB6", "8311030 - OBGN6", "8311062 - ALXN6", "8410140 - PSBM1", "8411060 - CFWM1", "8413320 - ATGM1", "8418150 - CASM1", "8419317 - WELM1", "8443970 - BHBM3", "8447386 - FRVM3", "8447387 - BLTM3", "8447412 - FRXM3", "8447930 - BZBM3", "8449130 - NTKM3", "8452660 - NWPR1", "8452944 - CPTR1", "8452951 - PTCR1", "8454000 - FOXR1", "8454049 - QPTR1", "8461490 - NLNC3", "8465705 - NWHC3", "8467150 - BRHC3", "8510560 - MTKN6", "8516945 - KPTN6", "8518750 - BATN6", "8519483 - BGNN4", "8519532 - MHRN6", "8530973 - ROBN4", "8531680 - SDHN4", "8534720 - ACYN4", "8536110 - CMAN4", "8537121 - SJSN4", "8538886 - TPBN4", "8539094 - BDRN4", "8540433 - MRCP1", "8545240 - PHBP1", "8548989 - NBLP1", "8551762 - DELD1", "8551910 - RDYD1", "8557380 - LWSD1", "8570283 - OCIM2", "8571421 - BISM2", "8571892 - CAMM2", "8573364 - TCBM2", "8573927 - CHCM2", "8574680 - BLTM2", "8574728 - FSKM2", "8575512 - APAM2", "8577018 - COVM2", "8577330 - SLIM2", "8578240 - PPTM2", "8594900 - WASD2", "8631044 - WAHV2", "8632200 - KPTV2", "8632837 - RPLV2", "8635027 - NCDV2", "8635750 - LWTV2", "8637611 - YKRV2", "8637689 - YKTV2", "8638511 - DOMV2", "8638595 - CRYV2", "8638610 - SWPV2", "8638614 - WDSV2", "8638863 - CBBV2", "8638999 - 
CHYV2", "8639348 - MNPV2", "8651370 - DUKN7", "8652587 - ORIN7", "8654467 - HCGN7", "8656483 - BFTN7", "8658120 - WLON7", "8658163 - JMPN7", "8661070 - MROS1", "8665530 - CHTS1", "8670870 - FPKG1", "8720030 - FRDF1", "8720215 - NFDF1", "8720218 - MYPF1", "8720219 - DMSF1", "8720228 - LTJF1", "8720233 - BLIF1", "8720245 - JXUF1", "8720357 - BKBF1", "8720503 - GCVF1", "8721604 - TRDF1", "8722670 - LKWF1", "8723214 - VAKF1", "8723970 - VCAF1", "8724580 - KYWF1", "8725110 - NPSF1", "8725520 - FMRF1", "8726384 - PMAF1", "8726412 - MTBF1", "8726520 - SAPF1", "8726607 - OPTF1", "8726667 - MCYF1", "8726669 - ERTF1", "8726673 - SBLF1", "8726679 - TSHF1", "8726694 - TPAF1", "8726724 - CWBF1", "8727520 - CKYF1", "8728690 - APCF1", "8729108 - PACF1", "8729210 - PCBF1", "8729840 - PCLF1", "8732828 - WBYA1", "8734673 - FMOA1", "8735180 - DILA1", "8736163 - MBPA1", "8736897 - MCGA1", "8737005 - PTOA1", "8737048 - OBLA1", "8741003 - PTBM6", "8741041 - ULAM6", "8741094 - RARM6", "8741501 - DKCM6", "8741533 - PNLM6", "8747437 - WYCM6", "8760721 - PILL1", "8760922 - PSTL1", "8761305 - SHBL1", "8761724 - GISL1", "8761927 - NWCL1", "8761955 - CARL1", "8762482 - BYGL1", "8762484 - FREL1", "8764044 - TESL1", "8764227 - AMRL1", "8764314 - EINL1", "8766072 - FRWL1", "8767816 - LCLL1", "8767961 - BKTL1", "8768094 - CAPL1", "8770570 - SBPT2", "8770613 - MGPT2", "8770822 - TXPT2", "8771013 - EPTT2", "8771341 - GNJT2", "8771450 - GTOT2", "8772447 - FCGT2", "8774770 - RCPT2", "8775870 - MQTT2", "8779770 - PTIT2", "9014070 - AGCM4", "9014090 - MBRM4", "9014098 - FTGM4", "9052030 - OSGN6", "9052058 - RCRN6", "9063012 - NIAN6", "9063020 - BUFN6", "9063028 - PSTN6", "9063038 - EREP1", "9063053 - FAIO1", "9063063 - CNDO1", "9063079 - MRHO1", "9063085 - THRO1", "9075014 - HRBM4", "9075065 - LPNM4", "9075080 - MACM4", "9075099 - DTLM4", "9076024 - RCKM4", "9076027 - WNEM4", "9076033 - LTRM4", "9076070 - SWPM4", "9087023 - LDTM4", "9087031 - HLNM4", "9087044 - CMTI2", "9087069 - KWNW3", "9087088 - MNMM4", "9087096 - PNLM4", "9099004 - PTIM4", "9099018 - MCGM4", "9099064 - DULM5", "9099090 - GDMM5", "9410170 - SDBC1", "9410172 - IIWC1", "9410230 - LJAC1", "9410660 - OHBC1", "9410665 - PRJC1", "9410670 - PFXC1", "9410840 - ICAC1", "9411340 - NTBC1", "9411406 - HRVC1", "9412110 - PSLC1", "9413450 - MTYC1", "9414290 - FTPC1", "9414296 - PXSC1", "9414311 - PXOC1", "9414523 - RTYC1", "9414750 - AAMC1", "9414763 - LNDC1", "9414769 - OMHC1", "9414776 - OKXC1", "9414797 - OBXC1", "9414847 - PPXC1", "9414863 - RCMC1", "9415020 - PRYC1", "9415102 - MZXC1", "9415115 - PSBC1", "9415118 - UPBC1", "9415141 - DPXC1", "9415144 - PCOC1", "9416841 - ANVC1", "9418767 - HBYC1", "9419750 - CECC1", "9431647 - PORO3", "9432780 - CHAO3", "9435380 - SBEO3", "9437540 - TLBO3", "9439011 - HMDO3", "9439040 - ASTO3", "9440422 - LOPW1", "9440910 - TOKW1", "9441102 - WPTW1", "9442396 - LAPW1", "9443090 - NEAW1", "9444090 - PTAW1", "9444900 - PTWW1", "9446482 - TCMW1", "9446484 - TCNW1", "9447130 - EBSW1", "9449424 - CHYW1", "9449880 - FRDW1", "9450460 - KECA2", "9451054 - PLXA2", "9451600 - ITKA2", "9452210 - JNEA2", "9452400 - SKTA2", "9452634 - ELFA2", "9453220 - YATA2", "9454050 - CRVA2", "9454240 - VDZA2", "9455090 - SWLA2", "9455500 - OVIA2", "9455760 - NKTA2", "9455920 - ANTA2", "9457292 - KDAA2", "9457804 - ALIA2", "9459450 - SNDA2", "9459881 - KGCA2", "9461380 - ADKA2", "9461710 - ATKA2", "9462450 - OLSA2", "9462620 - UNLA2", "9463502 - PMOA2", "9464212 - VCVA2", "9468756 - NMTA2", "9491094 - RDDA2", "9497645 - PRDA2", "9751364 - CHSV3", "9751381 - 
LAMV3", "9751401 - LTBV3", "9751639 - CHAV3", "9752695 - ESPP4", "9755371 - SJNP4", "9759110 - MGIP4", "9759394 - MGZP4", "9759412 - AUDP4", "9759938 - MISP4", "9761115 - BARA9"], -// "type": "platform", -// "namespace": "NOS - NWSLI PLATFORM THESAURUS" -// }, { -// "values": ["CONTINENT > NORTH AMERICA > CANADA > GREAT LAKES, CANADA", "CONTINENT > NORTH AMERICA > UNITED STATES OF AMERICA > GREAT LAKES", "OCEAN > ARCTIC OCEAN > BEAUFORT SEA", "OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN", "OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN > BAY OF FUNDY", "OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN > CARIBBEAN SEA", "OCEAN > ATLANTIC OCEAN > NORTH ATLANTIC OCEAN > GULF OF MEXICO", "OCEAN > PACIFIC OCEAN > CENTRAL PACIFIC OCEAN > HAWAIIAN ISLANDS", "OCEAN > PACIFIC OCEAN > NORTH PACIFIC OCEAN", "OCEAN > PACIFIC OCEAN > NORTH PACIFIC OCEAN > BERING SEA", "OCEAN > PACIFIC OCEAN > NORTH PACIFIC OCEAN > GULF OF ALASKA", "OCEAN > PACIFIC OCEAN > SOUTH PACIFIC OCEAN"], -// "type": "place", -// "namespace": "GCMD Keywords - Locations" -// }], -// "topicCategories": ["environment", "oceans", "climatologyMeteorologyAtmosphere"], -// "temporalBounding": { -// "beginDate": "2013-03-01", -// "beginIndeterminate": null, -// "endDate": null, -// "endIndeterminate": "now", -// "instant": null, -// "instantIndeterminate": null, -// "description": null -// }, -// "spatialBounding": { -// "type": "MultiPolygon", -// "coordinates": [ -// [ -// [ -// [-180.0, -14.28], -// [-61.821, -14.28], -// [-61.821, 70.4], -// [-180.0, 70.4], -// [-180.0, -14.28] -// ] -// ], -// [ -// [ -// [144.657, -14.28], -// [180.0, -14.28], -// [180.0, 70.4], -// [144.657, 70.4], -// [144.657, -14.28] -// ] -// ] -// ] -// }, -// "isGlobal": false, -// "acquisitionInstruments": [], -// "acquisitionOperations": [], -// "acquisitionPlatforms": [], -// "dataFormats": [{ -// "name": "ORIGINATOR DATA FORMAT", -// "version": null -// }], -// "links": [{ -// "linkName": "Descriptive Information", -// "linkProtocol": "HTTP", -// "linkUrl": "http://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.nodc:NDBC-COOPS", -// "linkDescription": "Navigate directly to the URL for a descriptive web page with download links.", -// "linkFunction": "information" -// }, { -// "linkName": "Granule Search", -// "linkProtocol": "HTTP", -// "linkUrl": "http://www.nodc.noaa.gov/search/granule/rest/find/document?searchText=fileIdentifier%3ACO-OPS*&start=1&max=100&expandResults=true&f=searchPage", -// "linkDescription": "Granule Search", -// "linkFunction": "search" -// }, { -// "linkName": "THREDDS", -// "linkProtocol": "THREDDS", -// "linkUrl": "http://data.nodc.noaa.gov/thredds/catalog/ndbc/co-ops/", -// "linkDescription": "These data are available through a variety of services via a THREDDS (Thematic Real-time Environmental Distributed Data Services) Data Server (TDS). Depending on the dataset, the TDS can provide WMS, WCS, DAP, HTTP, and other data access and metadata services as well. For more information on the TDS, see http://www.unidata.ucar.edu/software/thredds/current/tds/.", -// "linkFunction": "download" -// }, { -// "linkName": "OPeNDAP", -// "linkProtocol": "DAP", -// "linkUrl": "http://data.nodc.noaa.gov/opendap/ndbc/co-ops/", -// "linkDescription": "These data are available through the Data Access Protocol (DAP) via an OPeNDAP Hyrax server. 
For a listing of OPeNDAP clients which may be used to access OPeNDAP-enabled data sets, please see the OPeNDAP website at http://opendap.org/.", -// "linkFunction": "download" -// }, { -// "linkName": "HTTP", -// "linkProtocol": "HTTP", -// "linkUrl": "http://data.nodc.noaa.gov/ndbc/co-ops/", -// "linkDescription": "Navigate directly to the URL for data access and direct download.", -// "linkFunction": "download" -// }, { -// "linkName": "FTP", -// "linkProtocol": "FTP", -// "linkUrl": "ftp://ftp.nodc.noaa.gov/pub/data.nodc/ndbc/co-ops/", -// "linkDescription": "These data are available through the File Transfer Protocol (FTP). You may use any FTP client to download these data.", -// "linkFunction": "download" -// }], -// "responsibleParties": [{ -// "individualName": null, -// "organizationName": "DOC/NOAA/NESDIS/NCEI > National Centers for Environmental Information, NESDIS, NOAA, U.S. Department of Commerce", -// "positionName": null, -// "role": "publisher", -// "email": "NODC.DataOfficer@noaa.gov", -// "phone": "301-713-3277" -// }, { -// "individualName": null, -// "organizationName": "DOC/NOAA/NESDIS/NODC > National Oceanographic Data Center, NESDIS, NOAA, U.S. Department of Commerce", -// "positionName": null, -// "role": "publisher", -// "email": "NODC.DataOfficer@noaa.gov", -// "phone": "301-713-3277" -// }, { -// "individualName": "Rex V Hervey", -// "organizationName": "US DOC; NOAA; NWS; National Data Buoy Center (NDBC)", -// "positionName": null, -// "role": "resourceProvider", -// "email": "rex.hervey@noaa.gov", -// "phone": "228-688-3007" -// }, { -// "individualName": null, -// "organizationName": "US DOC; NOAA; NWS; National Data Buoy Center (NDBC)", -// "positionName": null, -// "role": "resourceProvider", -// "email": null, -// "phone": null -// }, { -// "individualName": null, -// "organizationName": "DOC/NOAA/NESDIS/NCEI > National Centers for Environmental Information, NESDIS, NOAA, U.S. Department of Commerce", -// "positionName": null, -// "role": "pointOfContact", -// "email": "NCEI.Info@noaa.gov", -// "phone": "301-713-3277" -// }, { -// "individualName": null, -// "organizationName": "Global Change Data Center, Science and Exploration Directorate, Goddard Space Flight Center (GSFC) National Aeronautics and Space Administration (NASA)", -// "positionName": null, -// "role": "custodian", -// "email": null, -// "phone": null -// }], -// "thumbnail": "http://data.nodc.noaa.gov/cgi-bin/gfx?id=gov.noaa.nodc:NDBC-COOPS", -// "thumbnailDescription": "Preview graphic", -// "creationDate": null, -// "revisionDate": null, -// "publicationDate": "2013-06-05", -// "citeAsStatements": ["Cite as: Hervey, R. V. and US DOC; NOAA; NWS; National Data Buoy Center (2013). Coastal meteorological and water temperature data from National Water Level Observation Network (NWLON) and Physical Oceanographic Real-Time System (PORTS) stations of the NOAA Center for Operational Oceanographic Products and Services (CO-OPS). National Oceanographic Data Center, NOAA. Dataset. [access date]"], -// "crossReferences": [], -// "largerWorks": [], -// "useLimitation": "accessLevel: Public", -// "legalConstraints": ["Cite as: Hervey, R. V. and US DOC; NOAA; NWS; National Data Buoy Center (2013). Coastal meteorological and water temperature data from National Water Level Observation Network (NWLON) and Physical Oceanographic Real-Time System (PORTS) stations of the NOAA Center for Operational Oceanographic Products and Services (CO-OPS). National Oceanographic Data Center, NOAA. Dataset. 
[access date]", "NOAA and NCEI cannot provide any warranty as to the accuracy, reliability, or completeness of furnished data. Users assume responsibility to determine the usability of these data. The user is responsible for the results of any application of this data for other than its intended purpose."], -// "accessFeeStatement": null, -// "orderingInstructions": null, -// "edition": null, -// "dsmmAccessibility": 0, -// "dsmmDataIntegrity": 0, -// "dsmmDataQualityAssessment": 0, -// "dsmmDataQualityAssurance": 0, -// "dsmmDataQualityControlMonitoring": 0, -// "dsmmPreservability": 0, -// "dsmmProductionSustainability": 0, -// "dsmmTransparencyTraceability": 0, -// "dsmmUsability": 0, -// "dsmmAverage": 0.0, -// "updateFrequency": "asNeeded", -// "presentationForm": "tableDigital", -// "services": [] -// }, -// "analysis": { -// "identification": { -// "fileIdentifierExists": true, -// "fileIdentifierString": "gov.noaa.nodc:NDBC-COOPS", -// "doiExists": true, -// "doiString": "doi:10.5072/FK2TEST", -// "parentIdentifierExists": false, -// "parentIdentifierString": null, -// "hierarchyLevelNameExists": false, -// "isGranule": false -// }, -// "titles": { -// "titleExists": true, -// "titleCharacters": 244, -// "alternateTitleExists": false, -// "alternateTitleCharacters": 0, -// "titleFleschReadingEaseScore": -15.662258064516124, -// "alternateTitleFleschReadingEaseScore": null, -// "titleFleschKincaidReadingGradeLevel": 23.14516129032258, -// "alternateTitleFleschKincaidReadingGradeLevel": null -// }, -// "description": { -// "descriptionExists": true, -// "descriptionCharacters": 642, -// "descriptionFleschReadingEaseScore": 24.320808988764043, -// "descriptionFleschKincaidReadingGradeLevel": 14.289078651685397 -// }, -// "dataAccess": { -// "dataAccessExists": true -// }, -// "thumbnail": { -// "thumbnailExists": true -// }, -// "temporalBounding": { -// "beginDescriptor": "VALID", -// "beginPrecision": "Days", -// "beginIndexable": true, -// "beginZoneSpecified": null, -// "beginUtcDateTimeString": "2013-03-01T00:00:00Z", -// "beginYear": 2013, -// "beginDayOfYear": 60, -// "beginDayOfMonth": 1, -// "beginMonth": 3, -// "endDescriptor": "UNDEFINED", -// "endPrecision": null, -// "endIndexable": true, -// "endZoneSpecified": null, -// "endUtcDateTimeString": null, -// "endYear": null, -// "endDayOfYear": null, -// "endDayOfMonth": null, -// "endMonth": null, -// "instantDescriptor": "UNDEFINED", -// "instantPrecision": null, -// "instantIndexable": true, -// "instantZoneSpecified": null, -// "instantUtcDateTimeString": null, -// "instantYear": null, -// "instantDayOfYear": null, -// "instantDayOfMonth": null, -// "instantMonth": null, -// "rangeDescriptor": "ONGOING" -// }, -// "spatialBounding": { -// "spatialBoundingExists": true, -// "isValid": true, -// "validationError": null -// } -// }, -// "fileInformation": null, -// "fileLocations": {}, -// "publishing": { -// "isPrivate": false, -// "until": null -// }, -// "relationships": [], -// "errors": [] -// }""") -// def record = AvroUtils.mapToAvro((Map)jsonrecord, ParsedRecord) -// -// // println("zeb "+JsonOutput.toJson(parsed)) -// println("ZEB") -// println(record) -// def discovery = record.getDiscovery(); -// def analysis = record.getAnalysis(); -// def discoveryMap = AvroUtils.avroToMap(discovery, true); -// -// // prepare and apply fields that need to be reformatted for search -// discoveryMap.putAll(TransformationUtils.prepareGcmdKeyword(discovery)); -// 
discoveryMap.putAll(TransformationUtils.prepareDates(discovery.getTemporalBounding(), analysis.getTemporalBounding())); -// discoveryMap.put("dataFormat", TransformationUtils.prepareDataFormats(discovery)); -// discoveryMap.put("linkProtocol", TransformationUtils.prepareLinkProtocols(discovery)); -// discoveryMap.put("serviceLinks", TransformationUtils.prepareServiceLinks(discovery)); -// discoveryMap.put("serviceLinkProtocol", TransformationUtils.prepareServiceLinkProtocols(discovery)); -// discoveryMap.putAll(TransformationUtils.prepareResponsibleParties(record)); -// discoveryMap.put("internalParentIdentifier", TransformationUtils.prepareInternalParentIdentifier(record)); -// discoveryMap.put("filename", TransformationUtils.prepareFilename(record)); -// discoveryMap.put("checksums", TransformationUtils.prepareChecksums(record)); -// -// def pruned = TransformationUtils.pruneKnownUnmappedFields(discoveryMap, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) -// def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS)) -// // def indexedRecord = DataUtils.removeFromMap(pruned, minus) -// -// println(JsonOutput.toJson(pruned)) -// println(JsonOutput.toJson(minus)) -// then: -// pruned == [] -// minus == [ -// internalParentIdentifier: null, // ok for granule, not collection -// temporalBounding: [ -// fakeField: 123 -// ], -// errors: [ -// [ -// nonsense: "horrible", -// ] -// ], -// garbage:"nuke meeee" -// ] -// // -// // def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "errors" ] -// // indexedRecord.keySet().size() == expectedKeyset.size() -// // indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) -// // -// // indexedRecord.temporalBounding == [ -// // beginDescriptor: ValidDescriptor.VALID, -// // beginPrecision: ChronoUnit.DAYS.toString(), -// // beginIndexable: true, -// // beginZoneSpecified: null, -// // beginUtcDateTimeString: "2000-02-01", -// // endDescriptor: null, -// // endPrecision: null, -// // endIndexable: null, -// // endZoneSpecified: null, -// // endUtcDateTimeString: null, -// // instantDescriptor: null, -// // instantPrecision: null, -// // instantIndexable: null, -// // instantZoneSpecified: null, -// // instantUtcDateTimeString: null, -// // rangeDescriptor: null -// // ] -// // -// // indexedRecord.errors.size() == 1 -// // indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired -// // source: "valid field" -// // ] -// } - - def "clean up nested map before indexing strictly mapped fields for search (collection)"() { - when: - - ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) - .setAnalysis( - Analysis.newBuilder().setTemporalBounding( - TemporalBoundingAnalysis.newBuilder() - .setBeginDescriptor(ValidDescriptor.VALID) - .setBeginIndexable(true) - .setBeginPrecision(ChronoUnit.DAYS.toString()) - .setBeginZoneSpecified(null) - .setBeginUtcDateTimeString("2000-02-01") - .setBeginYear(2000) - .setBeginMonth(2) - .setBeginDayOfYear(32) - .setBeginDayOfMonth(1) - .build() - ).build()).build() - - - // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - def indexedRecord = TransformationUtils.reformatMessage(record, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_SEARCH_INDEX_ALIAS).keySet()) - // def indexedRecord = 
DataUtils.removeFromMap(pruned, minus) - - then: - // - // def expectedKeyset = ["fileIdentifier", "parentIdentifier", "doi", "title", "description", "keywords", "topicCategories", "temporalBounding", "spatialBounding", "isGlobal", "acquisitionInstruments", "acquisitionOperations", "acquisitionPlatforms", "dataFormats", "links", "responsibleParties", "thumbnail", "citeAsStatements", "crossReferences", "largerWorks", "useLimitation", "legalConstraints", "accessFeeStatement", "orderingInstructions", "edition", "dsmmAverage", "services", "gcmdVerticalResolution", "gcmdDataCenters", "gcmdTemporalResolution", "gcmdLocations", "gcmdScience", "beginDate", "endDate", "endDayOfYear", "beginYear", "endMonth", "endYear", "endDayOfMonth", "dataFormat", "linkProtocol", "serviceLinks", "serviceLinkProtocol", "organizationNames", - // "individualNames", "checksums"] - - - indexedRecord.keySet().size() == collectionFields.size() - collectionFields.each({ assert indexedRecord.keySet().contains(it) }) - indexedRecord.keySet().each({ assert collectionFields.contains(it) }) - - } - - // def "clean up nested map before indexing strictly mapped fields for analysis and errors (granule)"() { // TODO change to use reformatMessageFor method - // when: - // def parsed = [ - // identification: null, - // internalParentIdentifier: null, - // titles: null, - // description: null, - // dataAccess: null, - // thumbnail: null, - // temporalBounding: [ - // beginDescriptor: ValidDescriptor.VALID, - // beginPrecision: ChronoUnit.DAYS.toString(), - // beginIndexable: true, - // beginZoneSpecified: null, - // beginUtcDateTimeString: "2000-02-01", - // beginYear: 2000, - // beginDayOfYear: 32, - // beginDayOfMonth: 1, - // beginMonth: 2, - // endDescriptor: null, - // endPrecision: null, - // endIndexable: null, - // endZoneSpecified: null, - // endUtcDateTimeString: null, - // endYear: null, - // endDayOfYear: null, - // endDayOfMonth: null, - // endMonth: null, - // instantDescriptor: null, - // instantPrecision: null, - // instantIndexable: null, - // instantZoneSpecified: null, - // instantUtcDateTimeString: null, - // instantYear: null, - // instantDayOfYear: null, - // instantDayOfMonth: null, - // instantMonth: null, - // rangeDescriptor: null, - // fakeField: 123 - // ], - // spatialBounding: null, - // errors: [ - // [ - // nonsense: "horrible", - // source: "valid field" - // ] - // ], - // garbage:"nuke meeee" - // ] - // - // // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) - // // .setAnalysis( - // // Analysis.newBuilder().setTemporalBounding( - // // TemporalBoundingAnalysis.newBuilder() - // // .setBeginDescriptor(ValidDescriptor.VALID) - // // .setBeginIndexable(true) - // // .setBeginPrecision(ChronoUnit.DAYS.toString()) - // // .setBeginZoneSpecified(null) - // // .setBeginUtcDateTimeString("2000-02-01") - // // .setBeginYear(2000) - // // .setBeginMonth(2) - // // .setBeginDayOfYear(32) - // // .setBeginDayOfMonth(1) - // // .build() - // // ).build()).build() - // - // // def parsed = TransformationUtils.unfilteredAEMessage(record) - // - // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - // def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.GRANULE_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - // def indexedRecord = DataUtils.removeFromMap(pruned, minus) - // - // then: - // minus == [ - // temporalBounding: [ - // fakeField: 123 - // ], - // 
errors: [ - // [ - // nonsense: "horrible", - // ] - // ], - // garbage:"nuke meeee" - // ] - // - // def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "internalParentIdentifier", "errors" ] - // indexedRecord.keySet().size() == expectedKeyset.size() - // indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) - // - // indexedRecord.temporalBounding == [ - // beginDescriptor: ValidDescriptor.VALID, - // beginPrecision: ChronoUnit.DAYS.toString(), - // beginIndexable: true, - // beginZoneSpecified: null, - // beginUtcDateTimeString: "2000-02-01", - // endDescriptor: null, - // endPrecision: null, - // endIndexable: null, - // endZoneSpecified: null, - // endUtcDateTimeString: null, - // instantDescriptor: null, - // instantPrecision: null, - // instantIndexable: null, - // instantZoneSpecified: null, - // instantUtcDateTimeString: null, - // rangeDescriptor: null - // ] - // - // indexedRecord.errors.size() == 1 - // indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired - // source: "valid field" - // ] - // - // } - - // def "clean up nested map before indexing strictly mapped fields for analysis and errors (collection)"() { // TODO change to use reformatMessageFor method - // when: - // def parsed = [ - // identification: null, - // internalParentIdentifier: null, - // titles: null, - // description: null, - // dataAccess: null, - // thumbnail: null, - // temporalBounding: [ - // beginDescriptor: ValidDescriptor.VALID, - // beginPrecision: ChronoUnit.DAYS.toString(), - // beginIndexable: true, - // beginZoneSpecified: null, - // beginUtcDateTimeString: "2000-02-01", - // beginYear: 2000, - // beginDayOfYear: 32, - // beginDayOfMonth: 1, - // beginMonth: 2, - // endDescriptor: null, - // endPrecision: null, - // endIndexable: null, - // endZoneSpecified: null, - // endUtcDateTimeString: null, - // endYear: null, - // endDayOfYear: null, - // endDayOfMonth: null, - // endMonth: null, - // instantDescriptor: null, - // instantPrecision: null, - // instantIndexable: null, - // instantZoneSpecified: null, - // instantUtcDateTimeString: null, - // instantYear: null, - // instantDayOfYear: null, - // instantDayOfMonth: null, - // instantMonth: null, - // rangeDescriptor: null, - // fakeField: 123 - // ], - // spatialBounding: null, - // errors: [ - // [ - // nonsense: "horrible", - // source: "valid field" - // ] - // ], - // garbage:"nuke meeee" - // ] - // - // // ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) - // // .setAnalysis( - // // Analysis.newBuilder().setTemporalBounding( - // // TemporalBoundingAnalysis.newBuilder() - // // .setBeginDescriptor(ValidDescriptor.VALID) - // // .setBeginIndexable(true) - // // .setBeginPrecision(ChronoUnit.DAYS.toString()) - // // .setBeginZoneSpecified(null) - // // .setBeginUtcDateTimeString("2000-02-01") - // // .setBeginYear(2000) - // // .setBeginMonth(2) - // // .setBeginDayOfYear(32) - // // .setBeginDayOfMonth(1) - // // .build() - // // ).build()).build() - // - // // def parsed = TransformationUtils.unfilteredAEMessage(record) - // - // def pruned = TransformationUtils.pruneKnownUnmappedFields(parsed, IndexingInput.getUnmappedAnalysisAndErrorsIndexFields()) - // def minus = TransformationUtils.identifyUnmappedFields(pruned, TestUtils.esConfig.indexedProperties(TestUtils.esConfig.COLLECTION_ERROR_AND_ANALYSIS_INDEX_ALIAS)) - // def indexedRecord = DataUtils.removeFromMap(pruned, minus) - // - 
// then: - // minus == [ - // internalParentIdentifier: null, // ok for granule, not collection - // temporalBounding: [ - // fakeField: 123 - // ], - // errors: [ - // [ - // nonsense: "horrible", - // ] - // ], - // garbage:"nuke meeee" - // ] - // - // def expectedKeyset = ["identification", "titles", "description", "dataAccess", "thumbnail", "temporalBounding", "spatialBounding", "errors" ] - // indexedRecord.keySet().size() == expectedKeyset.size() - // indexedRecord.keySet().each({ assert expectedKeyset.contains(it) }) - // - // indexedRecord.temporalBounding == [ - // beginDescriptor: ValidDescriptor.VALID, - // beginPrecision: ChronoUnit.DAYS.toString(), - // beginIndexable: true, - // beginZoneSpecified: null, - // beginUtcDateTimeString: "2000-02-01", - // endDescriptor: null, - // endPrecision: null, - // endIndexable: null, - // endZoneSpecified: null, - // endUtcDateTimeString: null, - // instantDescriptor: null, - // instantPrecision: null, - // instantIndexable: null, - // instantZoneSpecified: null, - // instantUtcDateTimeString: null, - // rangeDescriptor: null - // ] - // - // indexedRecord.errors.size() == 1 - // indexedRecord.errors[0] == [nonsense:"horrible", // FIXME this is not actually desired - // source: "valid field" - // ] - // - // } - //////////////////////////////// // Identifiers, "Names" // //////////////////////////////// @@ -1043,7 +302,7 @@ class TransformationUtilsSpec extends Specification { def "party names are not included in granule search info"() { when: def record = TestUtils.inputGranuleRecord // <-- granule! - def result = TransformationUtils.reformatMessage(record, collectionFields) // <-- top level reformat method! + def result = TransformationUtils.reformatMessage(record, collectionSearchFields) // <-- top level reformat method! then: result.individualNames == [] as Set @@ -1160,7 +419,7 @@ class TransformationUtilsSpec extends Specification { def "accession values are not included"() { when: - def result = TransformationUtils.reformatMessage(TestUtils.inputAvroRecord, collectionFields) + def result = TransformationUtils.reformatMessage(TestUtils.inputAvroRecord, collectionSearchFields) then: result.accessionValues == null From 6025f02fbf502def12700abf23a4f852cc2d231e Mon Sep 17 00:00:00 2001 From: Zeb Date: Tue, 9 Jun 2020 14:28:45 -0600 Subject: [PATCH 18/29] Minor cleanup --- .../indexer/util/TransformationUtilsSpec.groovy | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index d3da590de..971741271 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -85,18 +85,6 @@ class TransformationUtilsSpec extends Specification { /////////////////////////////// // Generic Indexed Fields // /////////////////////////////// - // def "only mapped #type fields are indexed"() { - // when: - // def result = TransformationUtils.reformatMessage(record, fields) - // - // then: - // result.keySet().each({ assert fields.keySet().contains(it) }) // TODO this is a shallow only check! 
- // - // where: - // type | fields | record - // 'collection' | collectionSearchFields | TestUtils.inputCollectionRecord - // 'granule' | granuleSearchFields | TestUtils.inputGranuleRecord - // } def "reformatMessage populates with correct fields for #label"() { when: From f49e8a0815b3cf76d45bb6690326434d991d61bc Mon Sep 17 00:00:00 2001 From: Zeb Date: Mon, 15 Jun 2020 11:43:57 -0600 Subject: [PATCH 19/29] Undo combining reformatMessage functions. --- .../onestop/indexer/util/IndexingUtils.java | 4 +- .../indexer/util/TransformationUtils.java | 55 +++++++++++++++-- .../util/TransformationUtilsSpec.groovy | 61 +++++++++++++++++-- 3 files changed, 108 insertions(+), 12 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java index aab650564..fbb1a5594 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java @@ -64,7 +64,7 @@ public static DocWriteRequest buildSearchWriteRequest(String indexName, DocWr } else { var formattedRecord = new HashMap(); - formattedRecord.putAll(TransformationUtils.reformatMessage(input.getValue().value(), input.getTargetSearchIndexFields())); + formattedRecord.putAll(TransformationUtils.reformatMessageForSearch(input.getValue().value(), input.getTargetSearchIndexFields())); formattedRecord.put("stagedDate", input.getValue().timestamp()); return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord); } @@ -76,7 +76,7 @@ public static DocWriteRequest buildAnalysisAndErrorWriteRequest(String indexN } else { var formattedRecord = new HashMap(); - formattedRecord.putAll(TransformationUtils.reformatMessage(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields())); + formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysis(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields())); formattedRecord.put("stagedDate", input.getValue().timestamp()); return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord); } diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index dce5ac3b2..9f395fa2e 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -32,18 +32,15 @@ public class TransformationUtils { /////////////////////////////////////////////////////////////////////////////// // Convert to Indexing Message // /////////////////////////////////////////////////////////////////////////////// - public static Map reformatMessage(ParsedRecord record, Set fields) { + public static Map reformatMessageForAnalysis(ParsedRecord record, Set fields) { - var discovery = record.getDiscovery(); var analysis = record.getAnalysis(); var errors = record.getErrors(); - var discoveryMap = AvroUtils.avroToMap(discovery, true); var analysisMap = AvroUtils.avroToMap(analysis, true); var message = new HashMap(); fields.forEach(field -> { message.put(field, analysisMap.get(field)); - message.put(field, discoveryMap.get(field)); }); if (fields.contains("internalParentIdentifier")) { analysisMap.put("internalParentIdentifier", prepareInternalParentIdentifier(record)); @@ -56,6 +53,56 @@ public static Map reformatMessage(ParsedRecord record, Set 
prepareIdentification(IdentificationAnalysis identification) { + var result = new HashMap(); + var analysis = AvroUtils.avroToMap(identification); // TODO using map because I need javadocs on the IdentificationAnalysis object... + + if (analysis == null) { + return result; + } + result.put("doiExists", analysis.get("doiExists")); + result.put("doiString", analysis.get("doiString")); + result.put("fileIdentifierExists", analysis.get("fileIdentifierExists")); + result.put("fileIdentifierString", analysis.get("fileIdentifierString")); + result.put("hierarchyLevelNameExists", analysis.get("hierarchyLevelNameExists")); + result.put("isGranule", analysis.get("isGranule")); + result.put("parentIdentifierExists", analysis.get("parentIdentifierExists")); + // if ((Boolean)analysis.get("isGranule")) { FIXME + // result.put("parentIdentifierString", analysis.get("parentIdentifierString")); + // } + return result; + } + + public static Map reformatMessageForSearch(ParsedRecord record, Set fields) { + + var discovery = record.getDiscovery(); + var analysis = record.getAnalysis(); + var errors = record.getErrors(); + var discoveryMap = AvroUtils.avroToMap(discovery, true); + var analysisMap = AvroUtils.avroToMap(analysis, true); + var message = new HashMap(); + + fields.forEach(field -> { + message.put(field, discoveryMap.get(field)); + }); + var errorsList = errors.stream() + .map(e -> AvroUtils.avroToMap(e)) + .collect(Collectors.toList()); + + if (fields.contains("errors")) { + message.put("errors", errorsList); + } + // prepare and apply fields that need to be reformatted for search Map> gcmdKeywords = prepareGcmdKeyword(discovery); gcmdKeywords.forEach((key, value) -> { diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index 971741271..74eec7759 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -86,7 +86,7 @@ class TransformationUtilsSpec extends Specification { // Generic Indexed Fields // /////////////////////////////// - def "reformatMessage populates with correct fields for #label"() { + def "reformatMessageForAnalysis populates with correct fields for #label"() { when: ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) @@ -118,7 +118,7 @@ class TransformationUtilsSpec extends Specification { ) .build() - def indexedRecord = TransformationUtils.reformatMessage(record, fields) + def indexedRecord = TransformationUtils.reformatMessageForAnalysis(record, fields) then: @@ -132,13 +132,62 @@ class TransformationUtilsSpec extends Specification { where: label | fields | shouldIncludeChecksums | shouldIncludeTemporalAnalysis | shouldIncludeParentIdentifier - 'search collections' | collectionSearchFields | false | false | false - 'search granules' | granuleSearchFields | true | false | true 'analysis and errors collections' | collectionAnalysisErrorFields | false | true | false 'analysis and errors granules' | granuleAnalysisErrorFields | false | true | true } + + def "reformatMessageForSearch populates with correct fields for #label"() { + when: + + ParsedRecord record = ParsedRecord.newBuilder(TestUtils.inputAvroRecord) + .setFileInformation( + FileInformation.newBuilder() + .setChecksums( + [ + Checksum.newBuilder() + .setAlgorithm(ChecksumAlgorithm.MD5) + .setValue('abc') + .build() + ] + ).build() + ) 
+ .setAnalysis( + Analysis.newBuilder().setTemporalBounding( + TemporalBoundingAnalysis.newBuilder() + .setBeginDescriptor(ValidDescriptor.VALID) + .setBeginIndexable(true) + .setBeginPrecision(ChronoUnit.DAYS.toString()) + .setBeginZoneSpecified(null) + .setBeginUtcDateTimeString("2000-02-01") + .setBeginYear(2000) + .setBeginMonth(2) + .setBeginDayOfYear(32) + .setBeginDayOfMonth(1) + .build() + ).build() + ) + .build() + + def indexedRecord = TransformationUtils.reformatMessageForSearch(record, fields) + + then: + + println(label) + println(JsonOutput.toJson(AvroUtils.avroToMap(record.getAnalysis(), true))) + println(JsonOutput.toJson(indexedRecord)) + indexedRecord.keySet().contains("checksums") == shouldIncludeChecksums + indexedRecord.keySet().contains("internalParentIdentifier") == shouldIncludeParentIdentifier + (indexedRecord.keySet().contains("temporalBounding") && indexedRecord.get("temporalBounding").keySet().contains("beginMonth")) == false + (indexedRecord.keySet().contains("temporalBounding") && indexedRecord.get("temporalBounding").keySet().contains("beginIndexable")) == shouldIncludeTemporalAnalysis + + where: + label | fields | shouldIncludeChecksums | shouldIncludeTemporalAnalysis | shouldIncludeParentIdentifier + 'search collections' | collectionSearchFields | false | false | false + 'search granules' | granuleSearchFields | true | false | true + } + //////////////////////////////// // Identifiers, "Names" // //////////////////////////////// @@ -290,7 +339,7 @@ class TransformationUtilsSpec extends Specification { def "party names are not included in granule search info"() { when: def record = TestUtils.inputGranuleRecord // <-- granule! - def result = TransformationUtils.reformatMessage(record, collectionSearchFields) // <-- top level reformat method! + def result = TransformationUtils.reformatMessageForSearch(record, collectionSearchFields) // <-- top level reformat method! then: result.individualNames == [] as Set @@ -407,7 +456,7 @@ class TransformationUtilsSpec extends Specification { def "accession values are not included"() { when: - def result = TransformationUtils.reformatMessage(TestUtils.inputAvroRecord, collectionSearchFields) + def result = TransformationUtils.reformatMessageForSearch(TestUtils.inputAvroRecord, collectionSearchFields) then: result.accessionValues == null From 9a136f904330b9527eb5ee0ab344ae97ba24c91b Mon Sep 17 00:00:00 2001 From: Zeb Date: Wed, 17 Jun 2020 13:27:06 -0600 Subject: [PATCH 20/29] Handle differences between granule and collection index. 
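
A note on the change below: the collection analysis-and-errors mapping is declared
"dynamic": "strict" and, unlike the granule mapping, does not appear to define
parentIdentifierString, so prepareIdentification now emits that field only for
granules. A minimal standalone sketch of that gating follows -- the class name,
method name, and the plain Map standing in for the Avro-backed
IdentificationAnalysis are illustrative only, not the project's actual types:

    import java.util.HashMap;
    import java.util.Map;

    class IdentificationGatingSketch {
      enum RecordType { collection, granule }

      // Copy the identification fields shared by both indices, then add
      // parentIdentifierString only for granules, so a document never carries
      // a field the strict collection mapping does not define.
      static Map<String, Object> prepare(Map<String, Object> analysis, RecordType recordType) {
        var result = new HashMap<String, Object>();
        result.put("isGranule", analysis.get("isGranule"));
        result.put("parentIdentifierExists", analysis.get("parentIdentifierExists"));
        if (recordType == RecordType.granule) {
          result.put("parentIdentifierString", analysis.get("parentIdentifierString"));
        }
        return result;
      }
    }

Under this scheme the same transform serves both indices; only the RecordType
passed in by the caller differs.
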
---
 .../cedar/onestop/indexer/util/IndexingInput.java  |  4 ++++
 .../cedar/onestop/indexer/util/IndexingUtils.java  |  2 +-
 .../onestop/indexer/util/TransformationUtils.java  | 12 ++++++------
 .../indexer/util/TransformationUtilsSpec.groovy    |  2 +-
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java
index fcfe59659..ceb79da2b 100644
--- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java
+++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingInput.java
@@ -72,6 +72,10 @@ public Set getTargetAnalysisAndErrorsIndexFields() {
    }
  }

+  public RecordType getRecordType() {
+    return recordType;
+  }
+
  @Override
  public String toString() {
    return "IndexingInput {" +
diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java
index fbb1a5594..764b39526 100644
--- a/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java
+++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/IndexingUtils.java
@@ -76,7 +76,7 @@ public static DocWriteRequest buildAnalysisAndErrorWriteRequest(String indexN
    }
    else {
      var formattedRecord = new HashMap();
-      formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysis(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields()));
+      formattedRecord.putAll(TransformationUtils.reformatMessageForAnalysis(input.getValue().value(), input.getTargetAnalysisAndErrorsIndexFields(), input.getRecordType()));
      formattedRecord.put("stagedDate", input.getValue().timestamp());
      return new IndexRequest(indexName).opType(opType).id(input.getKey()).source(formattedRecord);
    }
diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
index 9f395fa2e..27434082e 100644
--- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
+++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java
@@ -32,7 +32,7 @@ public class TransformationUtils {
  ///////////////////////////////////////////////////////////////////////////////
  // Convert to Indexing Message //
  ///////////////////////////////////////////////////////////////////////////////
-  public static Map reformatMessageForAnalysis(ParsedRecord record, Set fields) {
+  public static Map reformatMessageForAnalysis(ParsedRecord record, Set fields, RecordType recordType) {

    var analysis = record.getAnalysis();
    var errors = record.getErrors();
@@ -57,13 +57,13 @@ public static Map reformatMessageForAnalysis(ParsedRecord record
      message.put("temporalBounding", prepareTemporalBounding(analysis.getTemporalBounding()));
    }
    if (fields.contains("identification")) {
-      message.put("identification", prepareIdentification(analysis.getIdentification()));
+      message.put("identification", prepareIdentification(analysis.getIdentification(), recordType));
    }

    return message;
  }

-  public static Map prepareIdentification(IdentificationAnalysis identification) {
+  public static Map prepareIdentification(IdentificationAnalysis identification, RecordType recordType) {
    var result = new HashMap();
    var analysis = AvroUtils.avroToMap(identification); // TODO using map because I need javadocs on the IdentificationAnalysis object...
@@ -77,9 +77,9 @@ public static Map prepareIdentification(IdentificationAnalysis i result.put("hierarchyLevelNameExists", analysis.get("hierarchyLevelNameExists")); result.put("isGranule", analysis.get("isGranule")); result.put("parentIdentifierExists", analysis.get("parentIdentifierExists")); - // if ((Boolean)analysis.get("isGranule")) { FIXME - // result.put("parentIdentifierString", analysis.get("parentIdentifierString")); - // } + if (recordType == RecordType.granule) { + result.put("parentIdentifierString", analysis.get("parentIdentifierString")); + } return result; } diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index 74eec7759..00db07fa7 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -118,7 +118,7 @@ class TransformationUtilsSpec extends Specification { ) .build() - def indexedRecord = TransformationUtils.reformatMessageForAnalysis(record, fields) + def indexedRecord = TransformationUtils.reformatMessageForAnalysis(record, fields, RecordType.granule) then: From a9dab1c96887722ca1cbcbd377d347e681c30b56 Mon Sep 17 00:00:00 2001 From: Zeb Date: Mon, 22 Jun 2020 14:59:53 -0600 Subject: [PATCH 21/29] Refactor date parsing based on moving (some) logic to analysis. TODO: more of the logic really belongs in analysis. --- .../indexer/util/TransformationUtils.java | 303 +++++++++++++++--- .../util/TransformationUtilsSpec.groovy | 59 +++- 2 files changed, 299 insertions(+), 63 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index 27434082e..de501f5db 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -17,6 +17,9 @@ import static org.cedar.schemas.avro.psi.ValidDescriptor.UNDEFINED; import static org.cedar.schemas.avro.psi.ValidDescriptor.VALID; +// import org.cedar.schemas.analyze.Temporal; +// +// import java.time.temporal.ChronoField; import org.cedar.onestop.kafka.common.util.DataUtils; @@ -359,93 +362,293 @@ private static Map prepareTemporalBounding(TemporalBoundingAnaly return result; } - private static Map prepareDates(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { + private static Map prepareDatesForInstant(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { String beginDate, endDate; - Long year; Long beginYear, endYear; - int beginDayOfYear, beginDayOfMonth, beginMonth; - int endDayOfYear, endDayOfMonth, endMonth; + Integer beginDayOfYear, beginDayOfMonth, beginMonth; + Integer endDayOfYear, endDayOfMonth, endMonth; var result = new HashMap(); - // If bounding is actually an instant, set search fields accordingly - if (analysis.getRangeDescriptor() == TimeRangeDescriptor.INSTANT && analysis.getBeginDescriptor() == UNDEFINED) { + if (!analysis.getInstantIndexable()) { + // paleo dates are not indexable, so don't add beginDate or endDate to the index + beginDate = null; + endDate = null; + beginDayOfYear = null; + beginDayOfMonth = null; + beginMonth = null; + endDayOfYear = null; + endDayOfMonth = null; + endMonth = null; + } else { beginDate = analysis.getInstantUtcDateTimeString(); - year = parseYear(beginDate); 
- - // Add time and/or date to endDate based on precision var precision = analysis.getInstantPrecision(); if (precision.equals(ChronoUnit.DAYS.toString())) { // End of day endDate = bounding.getInstant() + "T23:59:59Z"; } else if (precision.equals(ChronoUnit.YEARS.toString())) { - if (!analysis.getInstantIndexable()) { - // Paleo date, so only return year value (null out dates) - beginDate = null; - endDate = null; - } else { - // Last day of year + end of day - endDate = bounding.getInstant() + "-12-31T23:59:59Z"; - } + // Last day of year + end of day + endDate = bounding.getInstant() + "-12-31T23:59:59Z"; } else { // Precision is NANOS so use instant value as-is + // TODO hopefully it's not a weird rare precision like "month" endDate = beginDate; } - beginYear = year; - endYear = year; - } else { - // If dates exist (thus VALID) and are indexable use value from analysis block where dates are UTC datetime normalized, - // else only set the year values as this is indicative of a paleo date - beginDate = analysis.getBeginDescriptor() == VALID && analysis.getBeginIndexable() ? analysis.getBeginUtcDateTimeString() : null; - beginYear = parseYear(analysis.getBeginUtcDateTimeString()); - endDate = analysis.getEndDescriptor() == VALID && analysis.getEndIndexable() ? analysis.getEndUtcDateTimeString() : null; - endYear = parseYear(analysis.getEndUtcDateTimeString()); + + if (analysis.getInstantDayOfYear() != null) { + beginDayOfYear = analysis.getInstantDayOfYear(); + endDayOfYear = analysis.getInstantDayOfYear(); + } else { + beginDayOfYear = 1; + endDayOfYear = 365; // TODO leap year + } + + if (analysis.getInstantDayOfMonth() != null) { + beginDayOfMonth = analysis.getInstantDayOfMonth(); + endDayOfMonth = analysis.getInstantDayOfMonth(); + } else { + beginDayOfMonth = 1; + endDayOfMonth = 31; // TODO depends on if there is a month but no day, but for the moment I'm assuming it's Year or Day precision, but not month... 
+ } + + if (analysis.getInstantMonth() != null) { + beginMonth = analysis.getInstantMonth(); + endMonth = analysis.getInstantMonth(); + } else { + beginMonth = 1; + endMonth = 12; + } } + beginYear = analysis.getInstantYear(); + endYear = analysis.getInstantYear(); + result.put("beginDate", beginDate); result.put("beginYear", beginYear); - result.putAll(parseAdditionalTimeFields("begin", beginDate)); + result.put("beginDayOfYear", beginDayOfYear); + result.put("beginDayOfMonth", beginDayOfMonth); + result.put("beginMonth", beginMonth); result.put("endDate", endDate); result.put("endYear", endYear); - result.putAll(parseAdditionalTimeFields("end", endDate)); + result.put("endDayOfYear", endDayOfYear); + result.put("endDayOfMonth", endDayOfMonth); + result.put("endMonth", endMonth); return result; } - private static HashMap parseAdditionalTimeFields(String prefix, String time){ + private static Map prepareBeginDate(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { var result = new HashMap(); - try { + Integer beginDayOfYear, beginDayOfMonth, beginMonth; - Integer dayOfYear, dayOfMonth, month; - if (time != null) { - ZonedDateTime dateTime = ZonedDateTime.parse(time); + if (analysis.getBeginDescriptor() == VALID) { + if (analysis.getBeginIndexable()) { + result.put("beginDate", analysis.getBeginUtcDateTimeString()); - dayOfYear = dateTime.getDayOfYear(); - dayOfMonth = dateTime.getDayOfMonth(); - month = dateTime.getMonthValue(); - } - else { - dayOfYear = null; - dayOfMonth = null; - month = null; + var precision = analysis.getBeginPrecision(); + + // if (Temporal.extractField(parsedDate, ChronoField.DAY_OF_YEAR) != null) { + if (precision.equals(ChronoUnit.DAYS.toString()) || precision.equals(ChronoUnit.NANOS.toString())) { + beginDayOfYear = analysis.getBeginDayOfYear(); + beginDayOfMonth = analysis.getBeginDayOfMonth(); + beginMonth = analysis.getBeginMonth(); + } + else { + beginDayOfYear = 1; + beginDayOfMonth = 1; + beginMonth = 1; // TODO base off month precision, if applicable + } + + } else { + beginDayOfYear = null; + beginDayOfMonth = null; + beginMonth = null; } + result.put("beginYear", analysis.getBeginYear()); + + // if (precision.equals(ChronoUnit.DAYS.toString())) { + // beginDayOfYear = analysis.getBeginDayOfYear(); + // beginDayOfMonth = analysis.getBeginDayOfMonth(); + // beginMonth = analysis.getBeginMonth(); + // } + // // else { + // // beginDayOfYear = 1; + // // beginDayOfMonth = 1; + // // beginMonth = 1; // TODO base off month precision, if applicable + // // } + // else { + // beginDayOfYear = null; + // beginDayOfMonth = null; + // beginMonth = null; + // } + + result.put("beginDayOfYear", beginDayOfYear); + result.put("beginDayOfMonth", beginDayOfMonth); + result.put("beginMonth", beginMonth); + } + return result; + } + + private static Map prepareEndDate(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { + var result = new HashMap(); + Integer endDayOfYear, endDayOfMonth, endMonth; + + if (analysis.getEndDescriptor() == VALID) { + if (analysis.getEndIndexable()) { + result.put("endDate", analysis.getEndUtcDateTimeString()); - result.put(prefix + "DayOfYear", dayOfYear); - result.put(prefix + "DayOfMonth", dayOfMonth); - result.put(prefix + "Month", month); - } catch (Exception e) {} // TODO temporary + var precision = analysis.getEndPrecision(); + if (precision.equals(ChronoUnit.DAYS.toString())) { + endDayOfYear = analysis.getEndDayOfYear(); + endDayOfMonth = analysis.getEndDayOfMonth(); + endMonth = analysis.getEndMonth(); + } + 
else { // TODO this implies other precision checks (begin date) are also needed + endDayOfYear = 365; // TODO leap years + endDayOfMonth = 31; // TODO base off month precision, if applicable + endMonth = 12; // TODO base off month precision, if applicable + } + } else { + endDayOfYear = null; + endDayOfMonth = null; + endMonth = null; + } + result.put("endYear", analysis.getEndYear()); + + + // if (precision.equals(ChronoUnit.DAYS.toString())) { + // endDayOfYear = analysis.getEndDayOfYear(); + // endDayOfMonth = analysis.getEndDayOfMonth(); + // endMonth = analysis.getEndMonth(); + // } + // // else { // TODO this implies other precision checks (begin date) are also needed + // // endDayOfYear = 365; // TODO leap years + // // endDayOfMonth = 31; // TODO base off month precision, if applicable + // // endMonth = 12; // TODO base off month precision, if applicable + // // } + // else { + // endDayOfYear = null; + // endDayOfMonth = null; + // endMonth = null; + // } + + result.put("endDayOfYear", endDayOfYear); + result.put("endDayOfMonth", endDayOfMonth); + result.put("endMonth", endMonth); + } return result; } - private static Long parseYear(String utcDateTime) { - if (StringUtils.isBlank(utcDateTime)) { - return null; + private static Map prepareDates(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { + String beginDate, endDate; + // Long year; + Long beginYear, endYear; + Integer beginDayOfYear, beginDayOfMonth, beginMonth; + Integer endDayOfYear, endDayOfMonth, endMonth; + var result = new HashMap(); + + // If bounding is actually an instant, set search fields accordingly + if (analysis.getRangeDescriptor() == TimeRangeDescriptor.INSTANT && analysis.getBeginDescriptor() == UNDEFINED) { + return prepareDatesForInstant(bounding, analysis); } else { - // Watch out for BCE years - return Long.parseLong(utcDateTime.substring(0, utcDateTime.indexOf('-', 1))); + // If dates exist (thus VALID) and are indexable use value from analysis block where dates are UTC datetime normalized, + // else only set the year values as this is indicative of a paleo date TODO does this all behave the same now? 
+ + result.putAll(prepareBeginDate(bounding, analysis)); + result.putAll(prepareEndDate(bounding, analysis)); + // if (analysis.getBeginDescriptor() == VALID && analysis.getBeginIndexable()) { + // beginDate = analysis.getBeginUtcDateTimeString(); + // beginYear = analysis.getBeginYear(); + // var precision = analysis.getBeginPrecision(); + // if (precision.equals(ChronoUnit.DAYS.toString())) { + // beginDayOfYear = analysis.getBeginDayOfYear(); + // beginDayOfMonth = analysis.getBeginDayOfMonth(); + // beginMonth = analysis.getBeginMonth(); + // } else { + // beginDayOfYear = 1; + // beginDayOfMonth = 1; + // beginMonth = 1; // TODO base off month precision, if applicable + // } + // } else { + // beginDate = null; + // beginYear = null; + // beginDayOfYear = null; + // beginDayOfMonth = null; + // beginMonth = null; + // } + // if (analysis.getEndDescriptor() == VALID && analysis.getEndIndexable()) { + // endDate = analysis.getEndUtcDateTimeString(); + // endYear = analysis.getEndYear(); + // var precision = analysis.getEndPrecision(); + // if (precision.equals(ChronoUnit.DAYS.toString())) { + // endDayOfYear = analysis.getEndDayOfYear(); + // endDayOfMonth = analysis.getEndDayOfMonth(); + // endMonth = analysis.getEndMonth(); + // } else { // TODO this implies other precision checks (begin date) are also needed + // endDayOfYear = 365; // TODO leap years + // endDayOfMonth = 31; // TODO base off month precision, if applicable + // endMonth = 12; // TODO base off month precision, if applicable + // } + // } else { + // endDate = null; + // endYear = null; + // endDayOfYear = null; + // endDayOfMonth = null; + // endMonth = null; + // } } + + // result.put("beginDate", beginDate); + // result.put("beginYear", beginYear); + // result.put("beginDayOfYear", beginDayOfYear); + // result.put("beginDayOfMonth", beginDayOfMonth); + // result.put("beginMonth", beginMonth); + // result.putAll(parseAdditionalTimeFields("begin", beginDate)); + + // result.put("endDate", endDate); + // result.put("endYear", endYear); + // result.put("endDayOfYear", endDayOfYear); + // result.put("endDayOfMonth", endDayOfMonth); + // result.put("endMonth", endMonth); + // result.putAll(parseAdditionalTimeFields("end", endDate)); + + return result; } + // private static HashMap parseAdditionalTimeFields(String prefix, String time){ + // var result = new HashMap(); + // try { + // + // Integer dayOfYear, dayOfMonth, month; + // if (time != null) { + // ZonedDateTime dateTime = ZonedDateTime.parse(time); + // + // dayOfYear = dateTime.getDayOfYear(); + // dayOfMonth = dateTime.getDayOfMonth(); + // month = dateTime.getMonthValue(); + // } + // else { + // dayOfYear = null; + // dayOfMonth = null; + // month = null; + // } + // + // result.put(prefix + "DayOfYear", dayOfYear); + // result.put(prefix + "DayOfMonth", dayOfMonth); + // result.put(prefix + "Month", month); + // } catch (Exception e) {} // TODO temporary + // return result; + // } + + // private static Long parseYear(String utcDateTime) { + // if (StringUtils.isBlank(utcDateTime)) { + // return null; + // } else { + // // Watch out for BCE years + // return Long.parseLong(utcDateTime.substring(0, utcDateTime.indexOf('-', 1))); + // } + // } + //////////////////////////// // Keywords // //////////////////////////// diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index 00db07fa7..b0792a5e2 100644 --- 
a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -349,25 +349,58 @@ class TransformationUtilsSpec extends Specification { //////////////////////////// // Dates // //////////////////////////// - def "When #situation.description, expected temporal bounding generated"() { + def "when #label, expected temporal bounding generated"() { when: - def newTimeMetadata = TransformationUtils.prepareDates(situation.bounding, situation.analysis) - + def discovery = Discovery.newBuilder().setTemporalBounding(input).build() + def newTimeMetadata = TransformationUtils.prepareDates(input, Temporal.analyzeBounding(discovery)) + println("debug"+label) + println(Temporal.analyzeBounding(discovery)) then: - newTimeMetadata.sort() == expectedResult + newTimeMetadata.beginDate == beginDate + newTimeMetadata.beginYear == beginYear + newTimeMetadata.beginDayOfYear == beginDayOfYear + newTimeMetadata.beginDayOfMonth == beginDayOfMonth + newTimeMetadata.beginMonth == beginMonth + newTimeMetadata.endDate == endDate + newTimeMetadata.endYear == endYear + newTimeMetadata.endDayOfYear == endDayOfYear + newTimeMetadata.endDayOfMonth == endDayOfMonth + newTimeMetadata.endMonth == endMonth where: - situation | expectedResult - situations.instantDay | [beginDate: '1999-12-31T00:00:00Z', beginYear: 1999, beginDayOfYear: 365, beginDayOfMonth: 31, beginMonth: 12, endDate: '1999-12-31T23:59:59Z', endYear: 1999, endDayOfYear:365, endDayOfMonth:31, endMonth:12].sort() - situations.instantYear | [beginDate: '1999-01-01T00:00:00Z', beginYear: 1999, beginDayOfYear: 1, beginDayOfMonth:1, beginMonth: 1, endDate: '1999-12-31T23:59:59Z', endYear: 1999, endDayOfMonth:31, endDayOfYear:365, endMonth:12].sort() - situations.instantPaleo | [beginDate: null, endDate: null, beginYear: -1000000000, endYear: -1000000000, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() - situations.instantNano | [beginDate: '2008-04-01T00:00:00Z', beginYear: 2008, beginDayOfYear: 92, beginDayOfMonth:1, beginMonth: 4, endDate: '2008-04-01T00:00:00Z', endYear: 2008, endDayOfYear: 92, endDayOfMonth:1, endMonth:4].sort() - situations.bounded | [beginDate: '1900-01-01T00:00:00Z', beginYear: 1900, beginDayOfYear: 1, beginDayOfMonth:1, beginMonth: 1, endDate: '2009-12-31T23:59:59Z', endYear: 2009, endDayOfYear:365, endDayOfMonth:31, endMonth:12].sort() - situations.paleoBounded | [beginDate: null, endDate: null, beginYear: -2000000000, endYear: -1000000000, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() - situations.ongoing | [beginDate: "1975-06-15T12:30:00Z", beginDayOfMonth:15, beginDayOfYear:166, beginMonth:6, beginYear:1975, endDate:null, endYear:null, endDayOfYear: null, endDayOfMonth: null, endMonth: null].sort() - situations.empty | [beginDate: null, endDate: null, beginYear: null, endYear: null, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() + label | input | beginDate | beginYear | beginDayOfYear | beginDayOfMonth | beginMonth | endDate | endYear | endDayOfYear | endDayOfMonth | endMonth + + "undefined range" | TemporalBounding.newBuilder().build() | null | null | null | null | null | null | null | null | null | null + "non-paleo bounded range with day and year precision" | 
TemporalBounding.newBuilder().setBeginDate('1900-01-01').setEndDate('2009').build() | '1900-01-01T00:00:00Z' | 1900 | 1 | 1 | 1 | '2009-12-31T23:59:59.999Z' | 2009 | 365 | 31 | 12 // TODO does this assumption re end date make sense, really? TODO had to add the .999 to endDate - why and is that good/bad/other? + "paleo bounded range" | TemporalBounding.newBuilder().setBeginDate('-2000000000').setEndDate('-1000000000').build() | null | -2000000000 | null | null | null | null | -1000000000 | null | null | null + "ongoing range with second precision for begin" | TemporalBounding.newBuilder().setBeginDate('1975-06-15T12:30:00Z').build() | "1975-06-15T12:30:00Z" | 1975 | 166 | 15 | 6 | null | null | null | null | null + // INSTANTS: + "non-paleo instant with years precision" | TemporalBounding.newBuilder().setInstant('1999').build() | '1999-01-01T00:00:00Z' | 1999 | 1 | 1 | 1 | '1999-12-31T23:59:59Z' | 1999 | 365 | 31 | 12 + "non-paleo instant with days precision" | TemporalBounding.newBuilder().setInstant('1999-12-31').build() | '1999-12-31T00:00:00Z' | 1999 | 365 | 31 | 12 | '1999-12-31T23:59:59Z' | 1999 | 365 | 31 | 12 + "paleo instant with years precision" | TemporalBounding.newBuilder().setInstant('-1000000000').build() | null | -1000000000 | null | null | null | null | -1000000000 | null | null | null // TODO I think this is a bug in analysis, that it doesn't populate instantYears + "non-paleo instant with nanos precision" | TemporalBounding.newBuilder().setInstant('2008-04-01T00:00:00Z').build() | '2008-04-01T00:00:00Z' | 2008 | 92 | 1 | 4 | '2008-04-01T00:00:00Z' | 2008 | 92 | 1 | 4 } + // def "When #situation.description, expected temporal bounding generated"() { + // when: + // def discovery = Discovery.newBuilder().setTemporalBounding(situation.bounding).build() + // def newTimeMetadata = TransformationUtils.prepareDates(situation.bounding, Temporal.analyzeBounding(discovery)) + // + // then: + // newTimeMetadata.sort() == expectedResult + // + // where: + // situation | expectedResult + // // situations.instantDay | [beginDate: '1999-12-31T00:00:00Z', beginYear: 1999, beginDayOfYear: 365, beginDayOfMonth: 31, beginMonth: 12, endDate: '1999-12-31T23:59:59Z', endYear: 1999, endDayOfYear:365, endDayOfMonth:31, endMonth:12].sort() + // // situations.instantYear | [beginDate: '1999-01-01T00:00:00Z', beginYear: 1999, beginDayOfYear: 1, beginDayOfMonth:1, beginMonth: 1, endDate: '1999-12-31T23:59:59Z', endYear: 1999, endDayOfMonth:31, endDayOfYear:365, endMonth:12].sort() + // // situations.instantPaleo | [beginDate: null, endDate: null, beginYear: -1000000000, endYear: -1000000000, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() + // // situations.instantNano | [beginDate: '2008-04-01T00:00:00Z', beginYear: 2008, beginDayOfYear: 92, beginDayOfMonth:1, beginMonth: 4, endDate: '2008-04-01T00:00:00Z', endYear: 2008, endDayOfYear: 92, endDayOfMonth:1, endMonth:4].sort() + // // situations.bounded | [beginDate: '1900-01-01T00:00:00Z', beginYear: 1900, beginDayOfYear: 1, beginDayOfMonth:1, beginMonth: 1, endDate: '2009-12-31T23:59:59Z', endYear: 2009, endDayOfYear:365, endDayOfMonth:31, endMonth:12].sort() // TODO does this assumption re end date make sense, really? 
+ // // situations.paleoBounded | [beginDate: null, endDate: null, beginYear: -2000000000, endYear: -1000000000, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() + // // situations.ongoing | [beginDate: "1975-06-15T12:30:00Z", beginDayOfMonth:15, beginDayOfYear:166, beginMonth:6, beginYear:1975, endDate:null, endYear:null, endDayOfYear: null, endDayOfMonth: null, endMonth: null].sort() + // situations.empty | [beginDate: null, endDate: null, beginYear: null, endYear: null, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() + // } + def "temporal bounding with #testCase dates is prepared correctly"() { given: def bounding = TemporalBounding.newBuilder().setBeginDate(begin).setEndDate(end).build() From ca62c8159c5caf4ab1340e24b952f023c4fb59ef Mon Sep 17 00:00:00 2001 From: Erin Date: Tue, 23 Jun 2020 12:09:21 -0600 Subject: [PATCH 22/29] Added null check for passed in analysis in prepareTemporalBounding --- .../org/cedar/onestop/indexer/util/TransformationUtils.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index 27434082e..46105be49 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -340,6 +340,11 @@ private static Map> prepareResponsibleParties(ParsedRecord r private static Map prepareTemporalBounding(TemporalBoundingAnalysis analysis) { var result = new HashMap(); + + if (analysis == null) { + return result; + } + result.put("beginDescriptor", analysis.getBeginDescriptor()); result.put("beginIndexable", analysis.getBeginIndexable()); result.put("beginPrecision", analysis.getBeginPrecision()); From bd73edb3d1e6b29a37f7c40a049a4c80818f9a5d Mon Sep 17 00:00:00 2001 From: Erin Date: Tue, 23 Jun 2020 12:12:10 -0600 Subject: [PATCH 23/29] npm run format --- client/src/components/collections/detail/GranulesSummary.jsx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/client/src/components/collections/detail/GranulesSummary.jsx b/client/src/components/collections/detail/GranulesSummary.jsx index 5ba814d26..2e8b3f589 100644 --- a/client/src/components/collections/detail/GranulesSummary.jsx +++ b/client/src/components/collections/detail/GranulesSummary.jsx @@ -48,7 +48,9 @@ export default class GranulesSummary extends React.Component { const linkText = loading || totalGranuleFilteredCount == totalGranuleCount - ? `Show all ${totalGranuleCount? totalGranuleCount.toLocaleString(): '0'} files in collection` + ? `Show all ${totalGranuleCount + ? 
totalGranuleCount.toLocaleString() + : '0'} files in collection` : `Show ${totalGranuleFilteredCount} matching files of ${totalGranuleCount} in collection` // TODO 508 this should probably be a link, not a button From 65b0a3a17b4b0dbc3925bf5a924d051f81c111fd Mon Sep 17 00:00:00 2001 From: Erin Date: Tue, 23 Jun 2020 12:21:16 -0600 Subject: [PATCH 24/29] Added test: reformatMessageForAnalysis output was missing parentIdentifierString --- .../util/TransformationUtilsSpec.groovy | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index 00db07fa7..f0544a2bd 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -3,6 +3,7 @@ package org.cedar.onestop.indexer.util import org.cedar.schemas.analyze.Analyzers import org.cedar.schemas.analyze.Temporal import org.cedar.schemas.avro.psi.Analysis +import org.cedar.schemas.avro.psi.IdentificationAnalysis import org.cedar.schemas.avro.psi.TemporalBoundingAnalysis import org.cedar.schemas.avro.psi.ValidDescriptor import org.cedar.schemas.avro.psi.Checksum @@ -137,6 +138,34 @@ class TransformationUtilsSpec extends Specification { } + def "reformatMessageForAnalysis populates #label"() { + String identifier = 'gov.noaa.nodc:0173643' + when: + def identificationAnalysis = IdentificationAnalysis.newBuilder() + .setFileIdentifierExists(true) + .setDoiExists(false) + .setParentIdentifierString(identifier) + .build() + def analysis = Analysis.newBuilder().setIdentification(identificationAnalysis).build() + ParsedRecord record = ParsedRecord.newBuilder().setType(type).setAnalysis(analysis).build() + + def indexedRecord = TransformationUtils.reformatMessageForAnalysis(record, fields, type) // use the type from the where: table rather than hardcoding granule + + then: + println(label) + println(JsonOutput.toJson(AvroUtils.avroToMap(record.getAnalysis(), true))) + println(JsonOutput.toJson(indexedRecord)) + indexedRecord.each { + key, value -> println("key=$key value=$value") + } + + indexedRecord?.identification?.parentIdentifierString == identifier + + where: + label | fields | type + 'collections with parentIdentifierString' | collectionAnalysisErrorFields | RecordType.collection + 'granules with parentIdentifierString' | granuleAnalysisErrorFields | RecordType.granule + } def "reformatMessageForSearch populates with correct fields for #label"() { when: From 95a943acb11510c6af14bb80b6c9397cb34aa430 Mon Sep 17 00:00:00 2001 From: Zeb Date: Fri, 26 Jun 2020 15:31:45 -0600 Subject: [PATCH 25/29] Move more logic into analysis - TODO some things may still be renamed.
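With this change the indexer stops deriving day-of-year/day-of-month/month fallbacks from precision strings itself and instead reads precomputed instant-end fields (e.g. instantEndUtcDateTimeString, instantEndDayOfYear) off the TemporalBoundingAnalysis produced by the schemas library. A minimal Groovy sketch of the intended behavior, mirroring the "instant on leapyear with month precision" row in the updated spec table below (hypothetical usage; assumes the analysis-updates schemas build):

  // analyze a month-precision instant and read the precomputed end-of-instant fields
  def bounding = TemporalBounding.newBuilder().setInstant('2004-02').build()
  def discovery = Discovery.newBuilder().setTemporalBounding(bounding).build()
  def analysis = Temporal.analyzeBounding(discovery)
  assert analysis.instantEndUtcDateTimeString == '2004-02-29T23:59:59.999Z' // leap-year February end
  assert analysis.instantEndDayOfYear == 60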
--- .../indexer/util/TransformationUtils.java | 224 ++---------------- .../util/TransformationUtilsSpec.groovy | 40 +--- 2 files changed, 28 insertions(+), 236 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index de501f5db..b67818e03 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -17,9 +17,6 @@ import static org.cedar.schemas.avro.psi.ValidDescriptor.UNDEFINED; import static org.cedar.schemas.avro.psi.ValidDescriptor.VALID; -// import org.cedar.schemas.analyze.Temporal; -// -// import java.time.temporal.ChronoField; import org.cedar.onestop.kafka.common.util.DataUtils; @@ -373,52 +370,17 @@ private static Map prepareDatesForInstant(TemporalBounding bound // paleo dates are not indexable, so don't add beginDate or endDate to the index beginDate = null; endDate = null; - beginDayOfYear = null; - beginDayOfMonth = null; - beginMonth = null; - endDayOfYear = null; - endDayOfMonth = null; - endMonth = null; } else { beginDate = analysis.getInstantUtcDateTimeString(); - var precision = analysis.getInstantPrecision(); - if (precision.equals(ChronoUnit.DAYS.toString())) { - // End of day - endDate = bounding.getInstant() + "T23:59:59Z"; - } else if (precision.equals(ChronoUnit.YEARS.toString())) { - // Last day of year + end of day - endDate = bounding.getInstant() + "-12-31T23:59:59Z"; - } else { - // Precision is NANOS so use instant value as-is - // TODO hopefully it's not a weird rare precision like "month" - endDate = beginDate; - } - - if (analysis.getInstantDayOfYear() != null) { - beginDayOfYear = analysis.getInstantDayOfYear(); - endDayOfYear = analysis.getInstantDayOfYear(); - } else { - beginDayOfYear = 1; - endDayOfYear = 365; // TODO leap year - } - - if (analysis.getInstantDayOfMonth() != null) { - beginDayOfMonth = analysis.getInstantDayOfMonth(); - endDayOfMonth = analysis.getInstantDayOfMonth(); - } else { - beginDayOfMonth = 1; - endDayOfMonth = 31; // TODO depends on if there is a month but no day, but for the moment I'm assuming it's Year or Day precision, but not month... 
- } - - if (analysis.getInstantMonth() != null) { - beginMonth = analysis.getInstantMonth(); - endMonth = analysis.getInstantMonth(); - } else { - beginMonth = 1; - endMonth = 12; - } + endDate = analysis.getInstantEndUtcDateTimeString(); } + beginDayOfYear = analysis.getInstantDayOfYear(); + endDayOfYear = analysis.getInstantEndDayOfYear(); + beginMonth = analysis.getInstantMonth(); + endMonth = analysis.getInstantEndMonth(); + beginDayOfMonth = analysis.getInstantDayOfMonth(); + endDayOfMonth = analysis.getInstantEndDayOfMonth(); beginYear = analysis.getInstantYear(); endYear = analysis.getInstantYear(); @@ -444,44 +406,12 @@ private static Map prepareBeginDate(TemporalBounding bounding, T if (analysis.getBeginDescriptor() == VALID) { if (analysis.getBeginIndexable()) { result.put("beginDate", analysis.getBeginUtcDateTimeString()); - - var precision = analysis.getBeginPrecision(); - - // if (Temporal.extractField(parsedDate, ChronoField.DAY_OF_YEAR) != null) { - if (precision.equals(ChronoUnit.DAYS.toString()) || precision.equals(ChronoUnit.NANOS.toString())) { - beginDayOfYear = analysis.getBeginDayOfYear(); - beginDayOfMonth = analysis.getBeginDayOfMonth(); - beginMonth = analysis.getBeginMonth(); - } - else { - beginDayOfYear = 1; - beginDayOfMonth = 1; - beginMonth = 1; // TODO base off month precision, if applicable - } - - } else { - beginDayOfYear = null; - beginDayOfMonth = null; - beginMonth = null; } - result.put("beginYear", analysis.getBeginYear()); - - // if (precision.equals(ChronoUnit.DAYS.toString())) { - // beginDayOfYear = analysis.getBeginDayOfYear(); - // beginDayOfMonth = analysis.getBeginDayOfMonth(); - // beginMonth = analysis.getBeginMonth(); - // } - // // else { - // // beginDayOfYear = 1; - // // beginDayOfMonth = 1; - // // beginMonth = 1; // TODO base off month precision, if applicable - // // } - // else { - // beginDayOfYear = null; - // beginDayOfMonth = null; - // beginMonth = null; - // } + beginDayOfYear = analysis.getBeginDayOfYear(); + beginDayOfMonth = analysis.getBeginDayOfMonth(); + beginMonth = analysis.getBeginMonth(); + result.put("beginYear", analysis.getBeginYear()); result.put("beginDayOfYear", beginDayOfYear); result.put("beginDayOfMonth", beginDayOfMonth); result.put("beginMonth", beginMonth); @@ -496,42 +426,12 @@ private static Map prepareEndDate(TemporalBounding bounding, Tem if (analysis.getEndDescriptor() == VALID) { if (analysis.getEndIndexable()) { result.put("endDate", analysis.getEndUtcDateTimeString()); - - var precision = analysis.getEndPrecision(); - if (precision.equals(ChronoUnit.DAYS.toString())) { - endDayOfYear = analysis.getEndDayOfYear(); - endDayOfMonth = analysis.getEndDayOfMonth(); - endMonth = analysis.getEndMonth(); - } - else { // TODO this implies other precision checks (begin date) are also needed - endDayOfYear = 365; // TODO leap years - endDayOfMonth = 31; // TODO base off month precision, if applicable - endMonth = 12; // TODO base off month precision, if applicable - } - } else { - endDayOfYear = null; - endDayOfMonth = null; - endMonth = null; } - result.put("endYear", analysis.getEndYear()); - - - // if (precision.equals(ChronoUnit.DAYS.toString())) { - // endDayOfYear = analysis.getEndDayOfYear(); - // endDayOfMonth = analysis.getEndDayOfMonth(); - // endMonth = analysis.getEndMonth(); - // } - // // else { // TODO this implies other precision checks (begin date) are also needed - // // endDayOfYear = 365; // TODO leap years - // // endDayOfMonth = 31; // TODO base off month precision, if applicable - // 
// endMonth = 12; // TODO base off month precision, if applicable - // // } - // else { - // endDayOfYear = null; - // endDayOfMonth = null; - // endMonth = null; - // } + endDayOfYear = analysis.getEndDayOfYear(); + endDayOfMonth = analysis.getEndDayOfMonth(); + endMonth = analysis.getEndMonth(); + result.put("endYear", analysis.getEndYear()); result.put("endDayOfYear", endDayOfYear); result.put("endDayOfMonth", endDayOfMonth); result.put("endMonth", endMonth); @@ -541,114 +441,22 @@ private static Map prepareEndDate(TemporalBounding bounding, Tem private static Map prepareDates(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { String beginDate, endDate; - // Long year; Long beginYear, endYear; Integer beginDayOfYear, beginDayOfMonth, beginMonth; Integer endDayOfYear, endDayOfMonth, endMonth; var result = new HashMap(); // If bounding is actually an instant, set search fields accordingly - if (analysis.getRangeDescriptor() == TimeRangeDescriptor.INSTANT && analysis.getBeginDescriptor() == UNDEFINED) { + if (analysis.getRangeDescriptor() == TimeRangeDescriptor.INSTANT && analysis.getBeginDescriptor() == UNDEFINED) { // distinguished getting begin and end date that were exactly the same (also described as instant) return prepareDatesForInstant(bounding, analysis); } else { - // If dates exist (thus VALID) and are indexable use value from analysis block where dates are UTC datetime normalized, - // else only set the year values as this is indicative of a paleo date TODO does this all behave the same now? - result.putAll(prepareBeginDate(bounding, analysis)); result.putAll(prepareEndDate(bounding, analysis)); - // if (analysis.getBeginDescriptor() == VALID && analysis.getBeginIndexable()) { - // beginDate = analysis.getBeginUtcDateTimeString(); - // beginYear = analysis.getBeginYear(); - // var precision = analysis.getBeginPrecision(); - // if (precision.equals(ChronoUnit.DAYS.toString())) { - // beginDayOfYear = analysis.getBeginDayOfYear(); - // beginDayOfMonth = analysis.getBeginDayOfMonth(); - // beginMonth = analysis.getBeginMonth(); - // } else { - // beginDayOfYear = 1; - // beginDayOfMonth = 1; - // beginMonth = 1; // TODO base off month precision, if applicable - // } - // } else { - // beginDate = null; - // beginYear = null; - // beginDayOfYear = null; - // beginDayOfMonth = null; - // beginMonth = null; - // } - // if (analysis.getEndDescriptor() == VALID && analysis.getEndIndexable()) { - // endDate = analysis.getEndUtcDateTimeString(); - // endYear = analysis.getEndYear(); - // var precision = analysis.getEndPrecision(); - // if (precision.equals(ChronoUnit.DAYS.toString())) { - // endDayOfYear = analysis.getEndDayOfYear(); - // endDayOfMonth = analysis.getEndDayOfMonth(); - // endMonth = analysis.getEndMonth(); - // } else { // TODO this implies other precision checks (begin date) are also needed - // endDayOfYear = 365; // TODO leap years - // endDayOfMonth = 31; // TODO base off month precision, if applicable - // endMonth = 12; // TODO base off month precision, if applicable - // } - // } else { - // endDate = null; - // endYear = null; - // endDayOfYear = null; - // endDayOfMonth = null; - // endMonth = null; - // } } - // result.put("beginDate", beginDate); - // result.put("beginYear", beginYear); - // result.put("beginDayOfYear", beginDayOfYear); - // result.put("beginDayOfMonth", beginDayOfMonth); - // result.put("beginMonth", beginMonth); - // result.putAll(parseAdditionalTimeFields("begin", beginDate)); - - // result.put("endDate", endDate); - // 
result.put("endYear", endYear); - // result.put("endDayOfYear", endDayOfYear); - // result.put("endDayOfMonth", endDayOfMonth); - // result.put("endMonth", endMonth); - // result.putAll(parseAdditionalTimeFields("end", endDate)); - return result; } - // private static HashMap parseAdditionalTimeFields(String prefix, String time){ - // var result = new HashMap(); - // try { - // - // Integer dayOfYear, dayOfMonth, month; - // if (time != null) { - // ZonedDateTime dateTime = ZonedDateTime.parse(time); - // - // dayOfYear = dateTime.getDayOfYear(); - // dayOfMonth = dateTime.getDayOfMonth(); - // month = dateTime.getMonthValue(); - // } - // else { - // dayOfYear = null; - // dayOfMonth = null; - // month = null; - // } - // - // result.put(prefix + "DayOfYear", dayOfYear); - // result.put(prefix + "DayOfMonth", dayOfMonth); - // result.put(prefix + "Month", month); - // } catch (Exception e) {} // TODO temporary - // return result; - // } - - // private static Long parseYear(String utcDateTime) { - // if (StringUtils.isBlank(utcDateTime)) { - // return null; - // } else { - // // Watch out for BCE years - // return Long.parseLong(utcDateTime.substring(0, utcDateTime.indexOf('-', 1))); - // } - // } - //////////////////////////// // Keywords // //////////////////////////// diff --git a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy index b0792a5e2..885b49441 100644 --- a/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy +++ b/indexer/src/test/groovy/org/cedar/onestop/indexer/util/TransformationUtilsSpec.groovy @@ -22,8 +22,6 @@ import groovy.json.JsonOutput import groovy.json.JsonSlurper import org.cedar.schemas.avro.util.AvroUtils -import static org.cedar.schemas.avro.util.TemporalTestData.getSituations - import org.cedar.onestop.kafka.common.util.DataUtils; @Unroll @@ -349,12 +347,14 @@ class TransformationUtilsSpec extends Specification { //////////////////////////// // Dates // //////////////////////////// + def "when #label, expected temporal bounding generated"() { when: def discovery = Discovery.newBuilder().setTemporalBounding(input).build() def newTimeMetadata = TransformationUtils.prepareDates(input, Temporal.analyzeBounding(discovery)) - println("debug"+label) - println(Temporal.analyzeBounding(discovery)) + + println("debug " + label + ": " + Temporal.analyzeBounding(discovery)) + then: newTimeMetadata.beginDate == beginDate newTimeMetadata.beginYear == beginYear @@ -371,36 +371,20 @@ class TransformationUtilsSpec extends Specification { label | input | beginDate | beginYear | beginDayOfYear | beginDayOfMonth | beginMonth | endDate | endYear | endDayOfYear | endDayOfMonth | endMonth "undefined range" | TemporalBounding.newBuilder().build() | null | null | null | null | null | null | null | null | null | null - "non-paleo bounded range with day and year precision" | TemporalBounding.newBuilder().setBeginDate('1900-01-01').setEndDate('2009').build() | '1900-01-01T00:00:00Z' | 1900 | 1 | 1 | 1 | '2009-12-31T23:59:59.999Z' | 2009 | 365 | 31 | 12 // TODO does this assumption re end date make sense, really? TODO had to add the .999 to endDate - why and is that good/bad/other? 
+ "non-paleo bounded range with day and year precision" | TemporalBounding.newBuilder().setBeginDate('1900-01-01').setEndDate('2009').build() | '1900-01-01T00:00:00Z' | 1900 | 1 | 1 | 1 | '2009-12-31T23:59:59.999Z' | 2009 | 365 | 31 | 12 "paleo bounded range" | TemporalBounding.newBuilder().setBeginDate('-2000000000').setEndDate('-1000000000').build() | null | -2000000000 | null | null | null | null | -1000000000 | null | null | null "ongoing range with second precision for begin" | TemporalBounding.newBuilder().setBeginDate('1975-06-15T12:30:00Z').build() | "1975-06-15T12:30:00Z" | 1975 | 166 | 15 | 6 | null | null | null | null | null // INSTANTS: - "non-paleo instant with years precision" | TemporalBounding.newBuilder().setInstant('1999').build() | '1999-01-01T00:00:00Z' | 1999 | 1 | 1 | 1 | '1999-12-31T23:59:59Z' | 1999 | 365 | 31 | 12 - "non-paleo instant with days precision" | TemporalBounding.newBuilder().setInstant('1999-12-31').build() | '1999-12-31T00:00:00Z' | 1999 | 365 | 31 | 12 | '1999-12-31T23:59:59Z' | 1999 | 365 | 31 | 12 - "paleo instant with years precision" | TemporalBounding.newBuilder().setInstant('-1000000000').build() | null | -1000000000 | null | null | null | null | -1000000000 | null | null | null // TODO I think this is a bug in analysis, that it doesn't populate instantYears + "instant leapyear" | TemporalBounding.newBuilder().setInstant('2004').build() | '2004-01-01T00:00:00Z' | 2004 | 1 | 1 | 1 | '2004-12-31T23:59:59.999Z' | 2004 | 366 | 31 | 12 + "instant with month precision" | TemporalBounding.newBuilder().setInstant('1999-02').build() | '1999-02-01T00:00:00Z' | 1999 | 32 | 1 | 2 | '1999-02-28T23:59:59.999Z' | 1999 | 59 | 28 | 2 + "instant on leapyear with month precision" | TemporalBounding.newBuilder().setInstant('2004-02').build() | '2004-02-01T00:00:00Z' | 2004 | 32 | 1 | 2 | '2004-02-29T23:59:59.999Z' | 2004 | 60 | 29 | 2 + "instant set with begin and end date matching" | TemporalBounding.newBuilder().setBeginDate('1994-07-20T13:22:00Z').setEndDate('1994-07-20T13:22:00Z').build() | '1994-07-20T13:22:00Z' | 1994 | 201 | 20 | 7 | '1994-07-20T13:22:00Z' | 1994 | 201 | 20 | 7 + "non-paleo instant with years precision" | TemporalBounding.newBuilder().setInstant('1999').build() | '1999-01-01T00:00:00Z' | 1999 | 1 | 1 | 1 | '1999-12-31T23:59:59.999Z' | 1999 | 365 | 31 | 12 + "non-paleo instant with days precision" | TemporalBounding.newBuilder().setInstant('1999-12-31').build() | '1999-12-31T00:00:00Z' | 1999 | 365 | 31 | 12 | '1999-12-31T23:59:59.999Z' | 1999 | 365 | 31 | 12 + "paleo instant with years precision" | TemporalBounding.newBuilder().setInstant('-1000000000').build() | null | -1000000000 | null | null | null | null | -1000000000 | null | null | null "non-paleo instant with nanos precision" | TemporalBounding.newBuilder().setInstant('2008-04-01T00:00:00Z').build() | '2008-04-01T00:00:00Z' | 2008 | 92 | 1 | 4 | '2008-04-01T00:00:00Z' | 2008 | 92 | 1 | 4 } - // def "When #situation.description, expected temporal bounding generated"() { - // when: - // def discovery = Discovery.newBuilder().setTemporalBounding(situation.bounding).build() - // def newTimeMetadata = TransformationUtils.prepareDates(situation.bounding, Temporal.analyzeBounding(discovery)) - // - // then: - // newTimeMetadata.sort() == expectedResult - // - // where: - // situation | expectedResult - // // situations.instantDay | [beginDate: '1999-12-31T00:00:00Z', beginYear: 1999, beginDayOfYear: 365, beginDayOfMonth: 31, beginMonth: 12, endDate: '1999-12-31T23:59:59Z', endYear: 1999, 
endDayOfYear:365, endDayOfMonth:31, endMonth:12].sort() - // // situations.instantYear | [beginDate: '1999-01-01T00:00:00Z', beginYear: 1999, beginDayOfYear: 1, beginDayOfMonth:1, beginMonth: 1, endDate: '1999-12-31T23:59:59Z', endYear: 1999, endDayOfMonth:31, endDayOfYear:365, endMonth:12].sort() - // // situations.instantPaleo | [beginDate: null, endDate: null, beginYear: -1000000000, endYear: -1000000000, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() - // // situations.instantNano | [beginDate: '2008-04-01T00:00:00Z', beginYear: 2008, beginDayOfYear: 92, beginDayOfMonth:1, beginMonth: 4, endDate: '2008-04-01T00:00:00Z', endYear: 2008, endDayOfYear: 92, endDayOfMonth:1, endMonth:4].sort() - // // situations.bounded | [beginDate: '1900-01-01T00:00:00Z', beginYear: 1900, beginDayOfYear: 1, beginDayOfMonth:1, beginMonth: 1, endDate: '2009-12-31T23:59:59Z', endYear: 2009, endDayOfYear:365, endDayOfMonth:31, endMonth:12].sort() // TODO does this assumption re end date make sense, really? - // // situations.paleoBounded | [beginDate: null, endDate: null, beginYear: -2000000000, endYear: -1000000000, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() - // // situations.ongoing | [beginDate: "1975-06-15T12:30:00Z", beginDayOfMonth:15, beginDayOfYear:166, beginMonth:6, beginYear:1975, endDate:null, endYear:null, endDayOfYear: null, endDayOfMonth: null, endMonth: null].sort() - // situations.empty | [beginDate: null, endDate: null, beginYear: null, endYear: null, beginDayOfYear: null, beginDayOfMonth:null, beginMonth: null, endDayOfYear: null, endDayOfMonth:null, endMonth:null].sort() - // } - def "temporal bounding with #testCase dates is prepared correctly"() { given: def bounding = TemporalBounding.newBuilder().setBeginDate(begin).setEndDate(end).build() From 04f543497106084388453a89361e647010ba481a Mon Sep 17 00:00:00 2001 From: Zeb Date: Fri, 26 Jun 2020 15:36:41 -0600 Subject: [PATCH 26/29] cleanup --- .../indexer/util/TransformationUtils.java | 70 ++++++------------- 1 file changed, 20 insertions(+), 50 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index b67818e03..e8bf6aaf4 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -360,94 +360,64 @@ private static Map prepareTemporalBounding(TemporalBoundingAnaly } private static Map prepareDatesForInstant(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { - String beginDate, endDate; - Long beginYear, endYear; - Integer beginDayOfYear, beginDayOfMonth, beginMonth; - Integer endDayOfYear, endDayOfMonth, endMonth; var result = new HashMap(); - if (!analysis.getInstantIndexable()) { - // paleo dates are not indexable, so don't add beginDate or endDate to the index - beginDate = null; - endDate = null; - } else { - beginDate = analysis.getInstantUtcDateTimeString(); - endDate = analysis.getInstantEndUtcDateTimeString(); + if (analysis.getInstantIndexable()) { + // paleo dates are not indexable, so only add beginDate or endDate to the index if instantIndexable + result.put("beginDate", analysis.getInstantUtcDateTimeString()); + result.put("endDate", analysis.getInstantEndUtcDateTimeString()); } - beginDayOfYear = 
analysis.getInstantDayOfYear(); - endDayOfYear = analysis.getInstantEndDayOfYear(); - beginMonth = analysis.getInstantMonth(); - endMonth = analysis.getInstantEndMonth(); - beginDayOfMonth = analysis.getInstantDayOfMonth(); - endDayOfMonth = analysis.getInstantEndDayOfMonth(); - beginYear = analysis.getInstantYear(); - endYear = analysis.getInstantYear(); - - result.put("beginDate", beginDate); - result.put("beginYear", beginYear); - result.put("beginDayOfYear", beginDayOfYear); - result.put("beginDayOfMonth", beginDayOfMonth); - result.put("beginMonth", beginMonth); - - result.put("endDate", endDate); - result.put("endYear", endYear); - result.put("endDayOfYear", endDayOfYear); - result.put("endDayOfMonth", endDayOfMonth); - result.put("endMonth", endMonth); + result.put("beginYear", analysis.getInstantYear()); + result.put("beginDayOfYear", analysis.getInstantDayOfYear()); + result.put("beginDayOfMonth", analysis.getInstantDayOfMonth()); + result.put("beginMonth", analysis.getInstantMonth()); + + result.put("endYear", analysis.getInstantYear()); + result.put("endDayOfYear", analysis.getInstantEndDayOfYear()); + result.put("endDayOfMonth", analysis.getInstantEndDayOfMonth()); + result.put("endMonth", analysis.getInstantEndMonth()); return result; } private static Map prepareBeginDate(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { var result = new HashMap(); - Integer beginDayOfYear, beginDayOfMonth, beginMonth; if (analysis.getBeginDescriptor() == VALID) { if (analysis.getBeginIndexable()) { result.put("beginDate", analysis.getBeginUtcDateTimeString()); } - beginDayOfYear = analysis.getBeginDayOfYear(); - beginDayOfMonth = analysis.getBeginDayOfMonth(); - beginMonth = analysis.getBeginMonth(); result.put("beginYear", analysis.getBeginYear()); - result.put("beginDayOfYear", beginDayOfYear); - result.put("beginDayOfMonth", beginDayOfMonth); - result.put("beginMonth", beginMonth); + result.put("beginDayOfYear", analysis.getBeginDayOfYear()); + result.put("beginDayOfMonth", analysis.getBeginDayOfMonth()); + result.put("beginMonth", analysis.getBeginMonth()); } return result; } private static Map prepareEndDate(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { var result = new HashMap(); - Integer endDayOfYear, endDayOfMonth, endMonth; if (analysis.getEndDescriptor() == VALID) { if (analysis.getEndIndexable()) { result.put("endDate", analysis.getEndUtcDateTimeString()); } - endDayOfYear = analysis.getEndDayOfYear(); - endDayOfMonth = analysis.getEndDayOfMonth(); - endMonth = analysis.getEndMonth(); result.put("endYear", analysis.getEndYear()); - result.put("endDayOfYear", endDayOfYear); - result.put("endDayOfMonth", endDayOfMonth); - result.put("endMonth", endMonth); + result.put("endDayOfYear", analysis.getEndDayOfYear()); + result.put("endDayOfMonth", analysis.getEndDayOfMonth()); + result.put("endMonth", analysis.getEndMonth()); } return result; } private static Map prepareDates(TemporalBounding bounding, TemporalBoundingAnalysis analysis) { - String beginDate, endDate; - Long beginYear, endYear; - Integer beginDayOfYear, beginDayOfMonth, beginMonth; - Integer endDayOfYear, endDayOfMonth, endMonth; var result = new HashMap(); // If bounding is actually an instant, set search fields accordingly - if (analysis.getRangeDescriptor() == TimeRangeDescriptor.INSTANT && analysis.getBeginDescriptor() == UNDEFINED) { // distinguished getting begin and end date that were exactly the same (also described as instant) + if (analysis.getRangeDescriptor() == 
TimeRangeDescriptor.INSTANT && analysis.getBeginDescriptor() == UNDEFINED) { // begin and end dates that are exactly the same are also described as INSTANT, but in that case prepareBeginDate and prepareEndDate must be used to read values off the correct analysis fields return prepareDatesForInstant(bounding, analysis); } else { result.putAll(prepareBeginDate(bounding, analysis)); From 27f3b6c17b63948ad009b2bbec6da50858502721 Mon Sep 17 00:00:00 2001 From: Zeb Date: Tue, 30 Jun 2020 15:01:16 -0600 Subject: [PATCH 27/29] Minor cleanup --- .../org/cedar/onestop/indexer/util/TransformationUtils.java | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java index e8bf6aaf4..c7ef47b89 100644 --- a/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java +++ b/indexer/src/main/java/org/cedar/onestop/indexer/util/TransformationUtils.java @@ -18,10 +18,6 @@ import static org.cedar.schemas.avro.psi.ValidDescriptor.UNDEFINED; import static org.cedar.schemas.avro.psi.ValidDescriptor.VALID; -import org.cedar.onestop.kafka.common.util.DataUtils; - -// TODO import org.apache.kafka.streams.StreamsBuilder; - /** * This class contains utilities for transforming the contents of the Avro (schemas) records into the appropriate * corresponding Elasticsearch mapping format. @@ -65,7 +61,7 @@ public static Map reformatMessageForAnalysis(ParsedRecord record public static Map prepareIdentification(IdentificationAnalysis identification, RecordType recordType) { var result = new HashMap(); - var analysis = AvroUtils.avroToMap(identification); // TODO using map because I need javadocs on the IdentificationAnalysis object... + var analysis = AvroUtils.avroToMap(identification); // currently using a map because this couldn't be made to work with the IdentificationAnalysis object directly. Worth revisiting at some point.
if (analysis == null) { return result; From af15fa46d3f9bf1bff6ccd6594be18b11ddc9eed Mon Sep 17 00:00:00 2001 From: Zeb Date: Tue, 30 Jun 2020 15:28:20 -0600 Subject: [PATCH 28/29] Added dashboard for errors (requires manual import at the moment) --- .../src/main/resources/dashboards/errors.ndjson | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 elastic-common/src/main/resources/dashboards/errors.ndjson diff --git a/elastic-common/src/main/resources/dashboards/errors.ndjson b/elastic-common/src/main/resources/dashboards/errors.ndjson new file mode 100644 index 000000000..5c3152530 --- /dev/null +++ b/elastic-common/src/main/resources/dashboards/errors.ndjson @@ -0,0 +1,10 @@ +{"attributes":{"fields":"[{\"name\":\"_id\",\"type\":\"string\",\"esTypes\":[\"_id\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_index\",\"type\":\"string\",\"esTypes\":[\"_index\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"_score\",\"type\":\"number\",\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_source\",\"type\":\"_source\",\"esTypes\":[\"_source\"],\"count\":0,\"scripted\":false,\"searchable\":false,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"_type\",\"type\":\"string\",\"esTypes\":[\"_type\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":false},{\"name\":\"dataAccess.dataAccessExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"description.descriptionCharacters\",\"type\":\"number\",\"esTypes\":[\"short\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"description.descriptionExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"description.descriptionFleschKincaidReadingGradeLevel\",\"type\":\"number\",\"esTypes\":[\"half_float\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"description.descriptionFleschReadingEaseScore\",\"type\":\"number\",\"esTypes\":[\"half_float\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"errors.detail\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false,\"subType\":{\"nested\":{\"path\":\"errors\"}}},{\"name\":\"errors.source\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"nested\":{\"path\":\"errors\"}}},{\"name\":\"errors.title\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"nested\":{\"path\":\"errors\"}}},{\"name\":\"identification.doiExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"identification.doiString\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\
"readFromDocValues\":false},{\"name\":\"identification.doiString.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"identification.doiString\"}}},{\"name\":\"identification.fileIdentifierExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"identification.fileIdentifierString\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"identification.fileIdentifierString.keyword\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true,\"subType\":{\"multi\":{\"parent\":\"identification.fileIdentifierString\"}}},{\"name\":\"identification.hierarchyLevelNameExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"identification.isGranule\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"identification.parentIdentifierExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"identification.parentIdentifierString\",\"type\":\"string\",\"esTypes\":[\"text\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":false,\"readFromDocValues\":false},{\"name\":\"internalParentIdentifier\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"spatialBounding.isValid\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"spatialBounding.spatialBoundingExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"spatialBounding.validationError\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"stagedDate\",\"type\":\"date\",\"esTypes\":[\"date\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.beginDescriptor\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.beginIndexable\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.beginPrecision\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.beginUtcDateTimeString\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.beginZoneSpecified\",\"type\":\"string\",\"esTypes\":[\"keywo
rd\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.endDescriptor\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.endIndexable\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.endPrecision\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.endUtcDateTimeString\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.endZoneSpecified\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.instantDescriptor\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.instantIndexable\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.instantPrecision\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.instantUtcDateTimeString\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.instantZoneSpecified\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"temporalBounding.rangeDescriptor\",\"type\":\"string\",\"esTypes\":[\"keyword\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"thumbnail.thumbnailExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"titles.alternateTitleCharacters\",\"type\":\"number\",\"esTypes\":[\"short\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"titles.alternateTitleExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"titles.alternateTitleFleschKincaidReadingGradeLevel\",\"type\":\"number\",\"esTypes\":[\"half_float\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"titles.alternateTitleFleschReadingEaseScore\",\"type\":\"number\",\"esTypes\":[\"half_float\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"titles.titleCharacters\",\"type\":\"number\",\"esTypes\":[\"short\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"titles.titleExists\",\"type\":\"boolean\",\"esTypes\":[\"boolean\"],\"count\":0,\"scripted\":false,\"searchab
le\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"titles.titleFleschKincaidReadingGradeLevel\",\"type\":\"number\",\"esTypes\":[\"half_float\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true},{\"name\":\"titles.titleFleschReadingEaseScore\",\"type\":\"number\",\"esTypes\":[\"half_float\"],\"count\":0,\"scripted\":false,\"searchable\":true,\"aggregatable\":true,\"readFromDocValues\":true}]","timeFieldName":"stagedDate","title":"analysis_error*"},"id":"80f1f940-af44-11ea-a83e-2ff43ec9c891","migrationVersion":{"index-pattern":"7.6.0"},"references":[],"type":"index-pattern","updated_at":"2020-06-16T22:08:50.511Z","version":"WzExMjksMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Mismatched Identifiers [OneStop] [Indexing Errors]","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"Mismatched Identifiers [OneStop] [Indexing Errors]\",\"type\":\"metric\",\"params\":{\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to Red\",\"metricColorMode\":\"Labels\",\"colorsRange\":[{\"type\":\"range\",\"from\":0,\"to\":1},{\"type\":\"range\",\"from\":1,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":true,\"subText\":\"\",\"fontSize\":60}},\"dimensions\":{\"metrics\":[{\"type\":\"vis_dimension\",\"accessor\":1,\"format\":{\"id\":\"number\",\"params\":{}}}],\"bucket\":{\"type\":\"vis_dimension\",\"accessor\":0,\"format\":{\"id\":\"string\",\"params\":{}}}},\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\"},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"filters\",\"schema\":\"group\",\"params\":{\"filters\":[{\"input\":{\"query\":\"identification.matchesIdentifiers:false\",\"language\":\"lucene\"},\"label\":\"Mismatched Identifiers\"}]}}]}"},"id":"061539c0-af45-11ea-a83e-2ff43ec9c891","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"80f1f940-af44-11ea-a83e-2ff43ec9c891","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-06-16T22:08:50.511Z","version":"WzExMzMsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Missing Identifiers [OneStop] [Indexing Errors]","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"Missing Identifiers [OneStop] [Indexing Errors]\",\"type\":\"metric\",\"params\":{\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to 
Red\",\"metricColorMode\":\"Labels\",\"colorsRange\":[{\"type\":\"range\",\"from\":0,\"to\":1},{\"type\":\"range\",\"from\":1,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":true,\"subText\":\"\",\"fontSize\":60}},\"dimensions\":{\"metrics\":[{\"type\":\"vis_dimension\",\"accessor\":1,\"format\":{\"id\":\"number\",\"params\":{}}}],\"bucket\":{\"type\":\"vis_dimension\",\"accessor\":0,\"format\":{\"id\":\"string\",\"params\":{}}}},\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\"},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"filters\",\"schema\":\"group\",\"params\":{\"filters\":[{\"input\":{\"query\":\"identification.fileIdentifierExists:false AND identification.doiExists:false\",\"language\":\"lucene\"},\"label\":\"Missing Identifiers\"}]}}]}"},"id":"043cb3c0-af46-11ea-a83e-2ff43ec9c891","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"80f1f940-af44-11ea-a83e-2ff43ec9c891","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-06-16T22:08:50.511Z","version":"WzExMzQsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Number of Analyzed Records [OneStop] [Indexing Errors]","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"Number of Analyzed Records [OneStop] [Indexing Errors]\",\"type\":\"metric\",\"params\":{\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to Red\",\"metricColorMode\":\"None\",\"colorsRange\":[{\"type\":\"range\",\"from\":0,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":false,\"subText\":\"\",\"fontSize\":60}},\"dimensions\":{\"metrics\":[{\"type\":\"vis_dimension\",\"accessor\":1,\"format\":{\"id\":\"number\",\"params\":{}}}],\"bucket\":{\"type\":\"vis_dimension\",\"accessor\":0,\"format\":{\"id\":\"string\",\"params\":{}}}},\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\"},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}}]}"},"id":"4d8e76e0-b017-11ea-a83e-2ff43ec9c891","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"80f1f940-af44-11ea-a83e-2ff43ec9c891","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-06-16T22:08:50.511Z","version":"WzExMzIsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Invalid Date [OneStop] [Indexing Errors]","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"Invalid Date [OneStop] [Indexing Errors]\",\"type\":\"metric\",\"params\":{\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to 
Red\",\"metricColorMode\":\"Labels\",\"colorsRange\":[{\"type\":\"range\",\"from\":0,\"to\":1},{\"type\":\"range\",\"from\":1,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":true,\"subText\":\"\",\"fontSize\":60}},\"dimensions\":{\"metrics\":[{\"type\":\"vis_dimension\",\"accessor\":1,\"format\":{\"id\":\"number\",\"params\":{}}}],\"bucket\":{\"type\":\"vis_dimension\",\"accessor\":0,\"format\":{\"id\":\"string\",\"params\":{}}}},\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\"},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"filters\",\"schema\":\"group\",\"params\":{\"filters\":[{\"input\":{\"query\":\"temporalBounding.instantIndexable:false OR temporalBounding.endIndexable:false OR temporalBounding.beginIndexable:false\",\"language\":\"lucene\"},\"label\":\"Invalid Date\"}]}}]}"},"id":"52d195a0-af46-11ea-a83e-2ff43ec9c891","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"80f1f940-af44-11ea-a83e-2ff43ec9c891","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-06-16T22:08:50.511Z","version":"WzExMzAsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Invalid Geometry [OneStop] [Indexing Errors]","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"Invalid Geometry [OneStop] [Indexing Errors]\",\"type\":\"metric\",\"params\":{\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to Red\",\"metricColorMode\":\"Labels\",\"colorsRange\":[{\"type\":\"range\",\"from\":0,\"to\":1},{\"type\":\"range\",\"from\":1,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":true,\"subText\":\"\",\"fontSize\":60}},\"dimensions\":{\"metrics\":[{\"type\":\"vis_dimension\",\"accessor\":1,\"format\":{\"id\":\"number\",\"params\":{}}}],\"bucket\":{\"type\":\"vis_dimension\",\"accessor\":0,\"format\":{\"id\":\"string\",\"params\":{}}}},\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\"},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"filters\",\"schema\":\"group\",\"params\":{\"filters\":[{\"input\":{\"query\":\"spatialBounding.isValid: false AND spatialBounding.spatialBoundingExists: true\",\"language\":\"lucene\"},\"label\":\"Invalid Geometry\"}]}}]}"},"id":"810213f0-af46-11ea-a83e-2ff43ec9c891","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"80f1f940-af44-11ea-a83e-2ff43ec9c891","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-06-16T22:08:50.511Z","version":"WzExMzEsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Record Ids [OneStop]","uiStateJSON":"{\"vis\":{\"params\":{\"sort\":{\"columnIndex\":null,\"direction\":null}}}}","version":1,"visState":"{\"title\":\"Record Ids 
[OneStop]\",\"type\":\"table\",\"params\":{\"perPage\":10,\"showPartialRows\":false,\"showMetricsAtAllLevels\":false,\"sort\":{\"columnIndex\":null,\"direction\":null},\"showTotal\":false,\"totalFunc\":\"sum\",\"percentageCol\":\"\",\"dimensions\":{\"metrics\":[{\"accessor\":0,\"format\":{\"id\":\"number\"},\"params\":{},\"label\":\"Count\",\"aggType\":\"count\"}],\"buckets\":[]}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"_id\",\"orderBy\":\"1\",\"order\":\"desc\",\"size\":25,\"otherBucket\":true,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\"}}]}"},"id":"d729de80-b0c6-11ea-84ab-a34d141ed3e3","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"80f1f940-af44-11ea-a83e-2ff43ec9c891","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-06-17T18:17:44.806Z","version":"WzE3MjMsMV0="} +{"attributes":{"description":"","kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"query\":\"\",\"language\":\"lucene\"},\"filter\":[],\"indexRefName\":\"kibanaSavedObjectMeta.searchSourceJSON.index\"}"},"title":"Missing Title [OneStop] [Indexing Errors]","uiStateJSON":"{}","version":1,"visState":"{\"title\":\"Missing Title [OneStop] [Indexing Errors]\",\"type\":\"metric\",\"params\":{\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to Red\",\"metricColorMode\":\"Labels\",\"colorsRange\":[{\"type\":\"range\",\"from\":0,\"to\":1},{\"type\":\"range\",\"from\":1,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":true,\"subText\":\"\",\"fontSize\":60}},\"dimensions\":{\"metrics\":[{\"type\":\"vis_dimension\",\"accessor\":1,\"format\":{\"id\":\"number\",\"params\":{}}}],\"bucket\":{\"type\":\"vis_dimension\",\"accessor\":0,\"format\":{\"id\":\"string\",\"params\":{}}}},\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\"},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"filters\",\"schema\":\"group\",\"params\":{\"filters\":[{\"input\":{\"query\":\"titles.titleExists:false\",\"language\":\"lucene\"},\"label\":\"Missing Title\"}]}}]}"},"id":"30ec1050-af46-11ea-a83e-2ff43ec9c891","migrationVersion":{"visualization":"7.4.2"},"references":[{"id":"80f1f940-af44-11ea-a83e-2ff43ec9c891","name":"kibanaSavedObjectMeta.searchSourceJSON.index","type":"index-pattern"}],"type":"visualization","updated_at":"2020-06-16T22:08:50.511Z","version":"WzExMzUsMV0="} 
+{"attributes":{"description":"","hits":0,"kibanaSavedObjectMeta":{"searchSourceJSON":"{\"query\":{\"language\":\"kuery\",\"query\":\"\"},\"filter\":[]}"},"optionsJSON":"{\"hidePanelTitles\":false,\"useMargins\":true}","panelsJSON":"[{\"version\":\"7.6.2\",\"gridData\":{\"x\":0,\"y\":0,\"w\":12,\"h\":9,\"i\":\"295956ed-b142-4eec-abee-783b6f0d30c7\"},\"panelIndex\":\"295956ed-b142-4eec-abee-783b6f0d30c7\",\"embeddableConfig\":{},\"panelRefName\":\"panel_0\"},{\"version\":\"7.6.2\",\"gridData\":{\"x\":12,\"y\":0,\"w\":12,\"h\":9,\"i\":\"92458b33-5178-411f-a5a8-35a2f7244f52\"},\"panelIndex\":\"92458b33-5178-411f-a5a8-35a2f7244f52\",\"embeddableConfig\":{},\"panelRefName\":\"panel_1\"},{\"version\":\"7.6.2\",\"gridData\":{\"x\":24,\"y\":0,\"w\":12,\"h\":9,\"i\":\"ed1dfd4e-d0b2-4771-928b-8df896c74aa3\"},\"panelIndex\":\"ed1dfd4e-d0b2-4771-928b-8df896c74aa3\",\"embeddableConfig\":{},\"panelRefName\":\"panel_2\"},{\"version\":\"7.6.2\",\"gridData\":{\"x\":0,\"y\":9,\"w\":12,\"h\":9,\"i\":\"5dc8ee16-75d9-40a8-8e22-6b0522a7837b\"},\"panelIndex\":\"5dc8ee16-75d9-40a8-8e22-6b0522a7837b\",\"embeddableConfig\":{},\"panelRefName\":\"panel_3\"},{\"version\":\"7.6.2\",\"gridData\":{\"x\":12,\"y\":9,\"w\":12,\"h\":9,\"i\":\"b384c5ff-a3c9-48de-b223-6f2125ea3bcd\"},\"panelIndex\":\"b384c5ff-a3c9-48de-b223-6f2125ea3bcd\",\"embeddableConfig\":{},\"panelRefName\":\"panel_4\"},{\"version\":\"7.6.2\",\"gridData\":{\"x\":24,\"y\":9,\"w\":24,\"h\":18,\"i\":\"df92499f-0e6d-4b2d-a1ba-063a323ca5d8\"},\"panelIndex\":\"df92499f-0e6d-4b2d-a1ba-063a323ca5d8\",\"embeddableConfig\":{},\"panelRefName\":\"panel_5\"},{\"version\":\"7.6.2\",\"gridData\":{\"x\":0,\"y\":18,\"w\":12,\"h\":9,\"i\":\"ccf1e2bf-d986-43e9-aedd-066788044135\"},\"panelIndex\":\"ccf1e2bf-d986-43e9-aedd-066788044135\",\"embeddableConfig\":{},\"panelRefName\":\"panel_6\"}]","timeRestore":false,"title":"Indexing Errors [OneStop]","version":1},"id":"524a6ed0-af47-11ea-a83e-2ff43ec9c891","migrationVersion":{"dashboard":"7.3.0"},"references":[{"id":"061539c0-af45-11ea-a83e-2ff43ec9c891","name":"panel_0","type":"visualization"},{"id":"043cb3c0-af46-11ea-a83e-2ff43ec9c891","name":"panel_1","type":"visualization"},{"id":"4d8e76e0-b017-11ea-a83e-2ff43ec9c891","name":"panel_2","type":"visualization"},{"id":"52d195a0-af46-11ea-a83e-2ff43ec9c891","name":"panel_3","type":"visualization"},{"id":"810213f0-af46-11ea-a83e-2ff43ec9c891","name":"panel_4","type":"visualization"},{"id":"d729de80-b0c6-11ea-84ab-a34d141ed3e3","name":"panel_5","type":"visualization"},{"id":"30ec1050-af46-11ea-a83e-2ff43ec9c891","name":"panel_6","type":"visualization"}],"type":"dashboard","updated_at":"2020-06-17T18:18:36.234Z","version":"WzE3MjYsMV0="} +{"exportedCount":9,"missingRefCount":0,"missingReferences":[]} From d22a0c1bc0ee3a66cf4e08a6665da34a3c91c743 Mon Sep 17 00:00:00 2001 From: Arianna Jakositz Date: Thu, 2 Jul 2020 11:02:28 -0600 Subject: [PATCH 29/29] update to tagged release of schemas --- buildSrc/src/main/kotlin/utils.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildSrc/src/main/kotlin/utils.kt b/buildSrc/src/main/kotlin/utils.kt index 770b24936..8192ce34a 100644 --- a/buildSrc/src/main/kotlin/utils.kt +++ b/buildSrc/src/main/kotlin/utils.kt @@ -41,7 +41,7 @@ object Versions { const val SNAKE_YAML = "1.24" const val REACTOR_BOM = "Dysprosium-SR7" - const val ONESTOP_SCHEMAS: String = "1250-date-parsing-exception-SNAPSHOT" + const val ONESTOP_SCHEMAS: String = "0.6.0" } // data classes