
Commit

script for creating template ctexts
Signed-off-by: Rieks <RieksJ@users.noreply.github.com>
RieksJ committed Jul 23, 2023
1 parent f7cce61 commit 4260363
Showing 6 changed files with 157 additions and 115 deletions.
Empty file added docs/tev2/23.2
3 changes: 2 additions & 1 deletion docs/tev2/saf.yaml
@@ -10,11 +10,12 @@ scope:
scopedir: https://github.com/tno-terminology-design/tev2-specifications/tree/master/docs/tev2 # URL of the scope-directory
curatedir: terms # directory where all curated files are located. Full URL is `scopedir`/`curatedir`
glossarydir: glossaries # directory where all glossary files and GDFs are located. Full URL is `scopedir`/`glossarydir`
website: https://tno-terminology-design.github.io/tev2-specifications/docs/tev2 # base URL for creating links to rendered versions of Curated Texts. It should also serve as the home page of the terminology.
navpath: /terms # Path to the directory where Curated Texts are rendered. What `curatedir` is for Curated Texts, `navpath` is for the rendered versions of Curated Texts.
defaultvsn: latest # vsntag that identifies the default terminology. MRG is located at `scopedir`/`glossarydir`/mrg.tev2.latest.yaml
license: LICENSE.md # file that contains the (default) licensing conditions. Full URL is `scopedir`/`license`
statuses: [ proposed, approved, deprecated ] # list of status tags that are defined for terminological artifacts in this scope
issues: https://github.com/tno-terminology-design/tev2-specifications/issues # URL where issues can be raised and handled
website: https://tno-terminology-design.github.io/tev2-specifications/docs/tev2 # base URL for creating links to rendered versions of Curated Texts. It should also serve as the home page of the terminology.
curators: # contacting individual curators
- name: RieksJ
email: # we split up the email address to reduce the likelihood of the address being harvested for spamming
99 changes: 99 additions & 0 deletions docs/tev2/unique.py
@@ -0,0 +1,99 @@
import os
import pandas as pd

def process_excel_row(row):
    # Capitalize the first character of every word in 'original'
    # (note: str.capitalize() also lowercases the rest of each word)
    original = ' '.join(word.capitalize() for word in str(row['original']).split())

    # Read cells as text; treat empty/NaN cells as the empty string.
    # `str(row[...]) or ''` would not work here: str(NaN) yields the
    # truthy text 'nan', so the `or ''` fallback would never be reached.
    termid = '' if pd.isna(row['termid']) else str(row['termid'])
    formphrase = '' if pd.isna(row['formphrase']) else str(row['formphrase'])
    synonymOf = '' if pd.isna(row['synonymOf']) else str(row['synonymOf'])
    grouptags = '' if pd.isna(row['grouptags']) else str(row['grouptags'])

    today = pd.Timestamp.today().strftime('%Y%m%d')

    # Filenames of the file to process and the file to write
    term_file_path = os.path.join('terms', f'{termid}.md')
    new_term_file_path = os.path.join('terms', f'_{termid}.md')

    # Check if a term file already exists for the given termid
    if os.path.exists(term_file_path):
        # Read the existing content of the term file
        with open(term_file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # If 'formphrase' is not empty, replace the 'formPhrases' line in the
        # frontmatter (this assumes the file starts with a '---'-delimited
        # frontmatter block). The grouptags merge that the spec describes for
        # existing term files is not implemented here.
        if formphrase.strip():
            frontmatter_start = content.find('---') + 3
            frontmatter_end = content.find('---', frontmatter_start)
            frontmatter = content[frontmatter_start:frontmatter_end].strip()
            frontmatter_lines = frontmatter.split('\n')
            # Drop any existing formPhrases line, then append the new one, so
            # the frontmatter ends up with exactly one formPhrases entry
            updated_frontmatter_lines = [line for line in frontmatter_lines if not line.startswith('formPhrases:')]
            updated_frontmatter_lines.append(f'formPhrases: {formphrase}')
            updated_frontmatter = '\n'.join(updated_frontmatter_lines)
            content = content[:frontmatter_start] + '\n' + updated_frontmatter + '\n' + content[frontmatter_end:]

        # Create the new term file with the updated content
        with open(new_term_file_path, 'w', encoding='utf-8') as f:
            f.write(content)
    else:
        # Create a new term file from the template; the f-string fills in the
        # variable placeholders directly, so no replace() calls are needed
        template = f"""---
# Docusaurus header
id: {termid}
# TEv2 Curated Text Header
term: {termid}
termType: concept
isa:
glossaryTerm: {original}
glossaryText: "glossary-text for '{original}'."
hoverText: "hover-text for '{original}'."
synonymOf: {synonymOf}
grouptags:
formPhrases: {formphrase}
# Curation status
status: proposed
created: {today}
updated: {today}
# Origins/Acknowledgements
contributors: RieksJ
attribution: "[TNO Terminology Design](https://tno-terminology-design.github.io/tev2-specifications/docs/tev2)"
originalLicense: "[CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/?ref=chooser-v1)"
---

# {original}

:::caution
The entire section on Terminology Engine v 2 (TEv2) is still under construction.<br/>
As TEv2 is not (yet) available, the texts that specify the tool are still 'raw', i.e. not yet processed.<br/>[readers](@) will need to see through some (currently unprocessed) notational conventions.
:::

### Summary

:::info Editor's Note
This file has been automatically created; its header needs to be revised and its contents need to be properly written.
:::
"""

        with open(new_term_file_path, 'w', encoding='utf-8') as f:
            f.write(template)

def main():
    # Read the excel file into a DataFrame; read_excel already treats the
    # first row as the header, so every row of df is a data row
    excel_file = 'unique_fields.xlsx'
    df = pd.read_excel(excel_file)

    # Process each data row
    for _, row in df.iterrows():
        process_excel_row(row)

if __name__ == '__main__':
    main()
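
To run the script (a usage sketch; pandas typically needs the `openpyxl` package to read `.xlsx` files):

    pip install pandas openpyxl
    python unique.py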
114 changes: 0 additions & 114 deletions docs/tev2/unique_fields.csv

This file was deleted.

Binary file added docs/tev2/unique_fields.xlsx
Binary file not shown.
56 changes: 56 additions & 0 deletions docs/tev2/unique_py_specs.txt
@@ -0,0 +1,56 @@
I want you to help me write a Python script, called `unique.py`.
The directory from which that script is called contains a file `unique_fields.xlsx` (Excel format) with five columns. The header row specifies the names: the first column is called `original`, the third column `termid`, the fourth `formphrase`, and the fifth `synonymOf`.
In that directory, there is a subdirectory `terms` that contains markdown files with YAML frontmatter and a markdown body. I use `terms directory` to refer to that directory, and `term file` to refer to a markdown file therein.

The script should process every line of the Excel file (ignoring the first line, as it is the header line). In the instructions that follow, I use
- `{today}` to refer to today's date in `yyyymmdd` format, which I expect you to compute;
- `{<columnname>}` as the name of a variable that holds the contents of the cell, in the row being processed, in the column named `<columnname>` (for example, `{termid}` would be the variable that holds the contents of the cell in the column named `termid`);
- `{original}` as the name of a variable that holds the contents of the cell in the column named `original`, where its contents have been modified by capitalizing the first character of every word therein (see the sketch below).
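
A minimal sketch of that capitalization (the helper name `capitalize_words` is mine, not part of the spec). Note that Python's `str.capitalize()` would also lowercase the remaining characters of each word, so this sketch touches only the first character:

    def capitalize_words(text: str) -> str:
        # Uppercase only the first character of every whitespace-separated word
        return ' '.join(word[:1].upper() + word[1:] for word in text.split())

    # capitalize_words('curated text')  ->  'Curated Text'
    # capitalize_words('TEv2 spec')     ->  'TEv2 Spec'  (rest of each word untouched)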

When reading cells from the Excel file, make sure that variables end up being proper texts, so:
- cells must be read so that they produce texts, not floating-point numbers or other types;
- cells that contain errors, or that are empty, void, null, NaN or similar, are considered to contain an empty string (see the sketch below).
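
By way of illustration, a sketch of such a text-or-empty read with pandas (the helper name `cell_text` is an assumption, not part of the spec):

    import pandas as pd

    def cell_text(value) -> str:
        # Return the cell contents as text; None/NaN/NaT and similar become ''
        if pd.isna(value):
            return ''
        return str(value)

Alternatively, `pd.read_excel(..., dtype=str)` forces all cells to be read as text, after which only the NaN check remains.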

Processing of a line of the Excel file is as follows. First, check if there is a term file (in the terms directory) whose name is `{termid}.md`, and create a new file called `_{termid}.md` (in the same directory), to which the end result of the processing will be written. Then,
1. if there is such a `{termid}.md` file, then the result of the processing will be a copy of its contents, with the following modifications:
- if `{formphrase}` contains non-whitespace characters, the line in the frontmatter that starts with `formPhrases` must be replaced with a line that only contains the text `formPhrases: {formphrase}`.
- if `{grouptags}` contains non-whitespace characters, its contents must be treated as a comma-separated list of words (each of which is a 'grouptag'). The line in the frontmatter that starts with `grouptags:` and is followed by a comma-separated list of words must then be modified so that every grouptag that is not already a word therein is appended, keeping the list a comma-separated word list (a sketch of this merge appears after the template below).
2. if there is no such file, the result is the creation of a new one using the text I specify below, which is all text (including the comments) between the two occurrences of `~~~`, replacing every occurrence of a variable name (e.g., `{termid}`) with its value:

~~~
---
# Docusaurus header
id: {termid}
# TEv2 Curated Text Header
term: {termid}
termType: concept
isa:
glossaryTerm: {original}
glossaryText: "glossary-text for `{original}`."
hoverText: "hover-text for `{original}`."
synonymOf: {synonymOf}
grouptags:
formPhrases: {formphrase}
# Curation status
status: proposed
created: {today}
updated: {today}
# Origins/Acknowledgements
contributors: RieksJ
attribution: "[TNO Terminology Design](https://tno-terminology-design.github.io/tev2-specifications/docs/tev2)"
originalLicense: "[CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/?ref=chooser-v1)"
---

# {original}

:::caution
The entire section on Terminology Engine v 2 (TEv2) is still under construction.<br/>
As TEv2 is not (yet) available, the texts that specify the tool are still 'raw', i.e. not yet processed.<br/>[readers](@) will need to see through some (currently unprocessed) notational conventions.
:::

### Summary

:::info Editor's Note
This file has been automatically created; its header needs to be revised and its contents need to be properly written.
:::
~~~
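
A minimal sketch of the grouptags merge described in item 1, assuming the frontmatter line has the form `grouptags: a, b, c` (the function name `merge_grouptags` is illustrative, not part of the spec):

    def merge_grouptags(line: str, new_tags: str) -> str:
        # Append each grouptag from new_tags that is not yet in the
        # comma-separated list on a 'grouptags:' frontmatter line
        existing = [t.strip() for t in line[len('grouptags:'):].split(',') if t.strip()]
        for tag in (t.strip() for t in new_tags.split(',')):
            if tag and tag not in existing:
                existing.append(tag)
        return 'grouptags: ' + ', '.join(existing)

    # merge_grouptags('grouptags: a, b', 'b, c')  ->  'grouptags: a, b, c'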
