From 8478e9f6c0b58954e98187716a4b97464d1c852f Mon Sep 17 00:00:00 2001 From: Samuel Larkin Date: Wed, 10 Jul 2024 11:24:05 -0400 Subject: [PATCH] doc(json): remove documents that are NOT of `news` domain keeping the document's structure --- docs/json.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 8 deletions(-) diff --git a/docs/json.md b/docs/json.md index 5b67cbd..0421ff1 100644 --- a/docs/json.md +++ b/docs/json.md @@ -45,7 +45,6 @@ zcat input.gz \ > output.gz ``` - ## Counting Elements Count the number of entries/sentence pairs that have the `.unparsable` key. @@ -55,7 +54,6 @@ pv Huge.jsonl \ | jq --null-input '[ inputs | select(.unparsable)] | reduce .[] as $item (0; . + 1)' ``` - ## Group by X and Merge Context: after generating `*.scores.json` using `sacrebleu --width=14 reference --metrics bleu chrf ter < translation > scores.json`. @@ -70,8 +68,6 @@ find -type f -name \*scores.json \ | less ``` - - ## Aggregate a Field Given a list of objects where some of them have the same `id` but with a field with different values, aggregate that field for each object. @@ -82,6 +78,7 @@ This happens when you extracted data from `mysql`. ```sh echo -e '{"id":1, "b":[{"c":1}]}{"id":1, "b":[{"c":2}]}' ``` + ``` { "id": 1, @@ -106,12 +103,11 @@ echo -e '{"id":1, "b":[{"c":1}]}{"id":1, "b":[{"c":2}]}' * take the first element and aggregate all of the `b` in a list * return that first element that has been augmented with a list of `b` - - ```sh echo -e '{"id":1, "b":[{"c":1}]}{"id":1, "b":[{"c":2}]}' \ | jq --slurp 'group_by(.id) | .[] | (.[0].b=([.[].b]|flatten)) | .[0]' ``` + ``` { "id": 1, @@ -140,30 +136,33 @@ zcat translation.fr.json.gz \ '[., $src, $ref] | transpose | map(add) | .[]' ``` - ## Flat Files to Structured json When you have multiple flat files that you want to combine into a structured json. 
*lingua_eng_spa/Tilde-worldbank-1-eng-spa.spa.gz* + ``` SPA 0.9998978843092705 SPA 0.9991979235059277 ``` *lingua_all_languages/Tilde-worldbank-1-eng-spa.spa.gz* + ``` SPA 0.9999975457963204 SPA 0.9847735076254288 ``` *Tilde-worldbank-1-eng-spa.spa.gz* + ``` "Igualmente, hacemos notar la importancia de abordar el problema del hambre y la malnutrición”. "La vida es muy difícil. ``` *Tilde-worldbank-1-eng-spa.eng.gz* + ``` " We also note the importance of addressing hunger and malnutrition.” "[Life] is extremely difficult. @@ -207,7 +206,6 @@ paste \ } ``` - ## XML to json Using [yq](https://github.com/mikefarah/yq/), we can convert a xml document into a json file. @@ -232,3 +230,65 @@ The second object is NOT an array but you need it to be an array to process all ```sh jq '.[] | .seg | (if type == "object" then [.] else . end) | .[]' ``` + +## Filter-out SubObjects + +Given + +```xml + + + + + +

+ Siso's depictions of land, water center new gallery exhibition +

+
+ +

+ Representaciones de la tierra y el agua de Siso centran una nueva exposición +

+
+
+ + +

+ Adapt the old, accommodate the new to solve issue +

+
+ +

+ Adapta lo viejo, incorpora lo nuevo para resolver el problema +

+
+
+
+
+``` + +Remove the documents that are NOT in the `news` domain while keeping the rest of the document's structure intact. + +```sh +~/.local/bin/yq 'del(.dataset.collection.doc[] | select(.["+@domain"] != "news"))' wmttest2024.en-es.xml +``` + +```xml + + + + + +

+ Siso's depictions of land, water center new gallery exhibition +

+
+ +

+ Representaciones de la tierra y el agua de Siso centran una nueva exposición +

+
+
+
+
+```