-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: Rieks <RieksJ@users.noreply.github.com>
- Loading branch information
Showing
6 changed files
with
157 additions
and
115 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
import os | ||
import pandas as pd | ||
import shutil | ||
|
||
def _cell_text(value):
    """Return a spreadsheet cell as stripped text.

    Missing values (None, NaN, NaT, pd.NA) become the empty string, and
    integral floats are rendered without a trailing '.0', so that ids read
    from a numeric column do not end up as '12.0'.
    """
    if value is None:
        return ''
    if isinstance(value, str):
        return value.strip()
    if pd.isna(value):
        return ''
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def _merge_grouptags(current, additions):
    """Append every tag in `additions` that is not yet in `current`.

    Both arguments are comma-separated word lists; the result is one too.
    NOTE(review): assumes the existing value is a plain comma-separated list
    (no YAML brackets or quotes) -- confirm against the real term files.
    """
    merged = [tag.strip() for tag in current.split(',') if tag.strip()]
    for tag in additions.split(','):
        tag = tag.strip()
        if tag and tag not in merged:
            merged.append(tag)
    return ', '.join(merged)


def _set_frontmatter_key(header_lines, key, new_value):
    """Return a copy of `header_lines` with the first `key:` line rewritten.

    `new_value` is called with the text that currently follows `key:` (or ''
    when the key is absent, in which case a new line is appended).
    """
    prefix = f'{key}:'
    updated = list(header_lines)
    for index, line in enumerate(updated):
        if line.startswith(prefix):
            updated[index] = f'{prefix} {new_value(line[len(prefix):])}'
            return updated
    updated.append(f'{prefix} {new_value("")}')
    return updated


def _update_frontmatter(content, formphrase, grouptags):
    """Apply the formPhrases/grouptags updates to the frontmatter of `content`.

    The frontmatter is the run of lines between the first two `---` fence
    lines; everything else is returned untouched. Content that does not
    start with a fence, or has no closing fence, is returned unchanged.
    """
    lines = content.split('\n')
    opening = next((i for i, line in enumerate(lines) if line.strip()), None)
    if opening is None or lines[opening].strip() != '---':
        return content
    closing = next(
        (i for i in range(opening + 1, len(lines)) if lines[i].strip() == '---'),
        None,
    )
    if closing is None:
        return content

    header = lines[opening + 1:closing]
    if formphrase:
        header = _set_frontmatter_key(header, 'formPhrases', lambda _: formphrase)
    if grouptags:
        header = _set_frontmatter_key(
            header, 'grouptags', lambda current: _merge_grouptags(current, grouptags)
        )
    # Splice on line boundaries so the fences keep their own lines.
    return '\n'.join(lines[:opening + 1] + header + lines[closing:])


def process_excel_row(row):
    """Write `terms/_{termid}.md` for one row of the spreadsheet.

    `row` is indexed by the column names 'original', 'termid', 'formphrase',
    'synonymOf' and 'grouptags'. Empty/NaN cells are treated as ''.

    * If `terms/{termid}.md` exists, the new file is a copy of it in which a
      non-empty `formphrase` replaces the `formPhrases:` frontmatter line and
      any new `grouptags` are appended to the `grouptags:` line.
    * Otherwise the new file is generated from the built-in template.

    Rows without a termid are skipped, since they cannot name a file.
    """
    # Upper-case only the first character of each word; str.capitalize()
    # would also lower-case the rest (turning 'TEv2' into 'Tev2').
    original = ' '.join(
        word[:1].upper() + word[1:] for word in _cell_text(row['original']).split()
    )
    termid = _cell_text(row['termid'])
    formphrase = _cell_text(row['formphrase'])
    synonymOf = _cell_text(row['synonymOf'])
    grouptags = _cell_text(row['grouptags'])

    if not termid:
        return

    today = pd.Timestamp.today().strftime('%Y%m%d')

    # Filenames of files to process and to write
    os.makedirs('terms', exist_ok=True)
    term_file_path = os.path.join('terms', f'{termid}.md')
    new_term_file_path = os.path.join('terms', f'_{termid}.md')

    if os.path.exists(term_file_path):
        with open(term_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        result = _update_frontmatter(content, formphrase, grouptags)
    else:
        # Built as a list of lines so the file text does not depend on how
        # this function is indented.
        result = '\n'.join([
            '---',
            '# Docusaurus header',
            f'id: {termid}',
            '# TEv2 Curated Text Header',
            f'term: {termid}',
            'termType: concept',
            'isa:',
            f'glossaryTerm: {original}',
            f'glossaryText: "glossary-text for \'{original}\'."',
            f'hoverText: "hover-text for \'{original}\'."',
            f'synonymOf: {synonymOf}',
            'grouptags:',
            f'formPhrases: {formphrase}',
            '# Curation status',
            'status: proposed',
            f'created: {today}',
            f'updated: {today}',
            '# Origins/Acknowledgements',
            'contributors: RieksJ',
            'attribution: "[TNO Terminology Design](https://tno-terminology-design.github.io/tev2-specifications/docs/tev2)"',
            'originalLicense: "[CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/?ref=chooser-v1)"',
            '---',
            f'# {original}',
            ':::caution',
            'The entire section on Terminology Engine v 2 (TEv2) is still under construction.<br/>',
            "As TEv2 is not (yet) available, the texts that specify the tool are still 'raw', i.e. not yet processed.<br/>[readers](@) will need to see through some (currently unprocessed) notational conventions.",
            ':::',
            '### Summary',
            ":::info Editor's Note",
            "This file has been automatically created; it's header needs to be revised and its contents needs to be properly written.",
            ':::',
            '',
        ])

    with open(new_term_file_path, 'w', encoding='utf-8') as f:
        f.write(result)
|
||
def main():
    """Process every data row of `unique_fields.xlsx` in the current directory."""
    excel_file = 'unique_fields.xlsx'
    # Read every cell as text and turn empty cells into '' so that no row
    # carries floats or NaN into the generated files.
    df = pd.read_excel(excel_file, dtype=str).fillna('')

    # read_excel already consumes the first sheet row as the column header,
    # so every row of the DataFrame is data; slicing with df[1:] would drop
    # the first real entry.
    for _, row in df.iterrows():
        process_excel_row(row)


if __name__ == '__main__':
    main()
This file was deleted.
Oops, something went wrong.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
I want you to help me write a python script, called `unique.py`. | ||
The directory from which that script is called contains a file `unique_fields.xlsx` (excel format) with five columns. The header row specifies the names: the first column is called `original`, the third column `termid`, the fourth `formphrase`, and the fifth `synonymOf`. | ||
In that directory, there is a subdirectory `terms` that contains markdown files with some yaml frontmatter, and a markdown body. I use `terms directory` to refer to that directory. I use `term file` to refer to a markdown file therein. | ||
|
||
The script should process every line of the excel file (ignoring the first line, as it is the header line). In the instructions that follow, I use | ||
- `{today}` to refer to the text in `yyyymmdd` format that represents today's date which I expect you to compute; | ||
- `{<columnname>}` as the name of a variable that holds the contents of the row whose header is `<columnname>` (for example, `{termid}` would be the variable that holds the contents of the cell in the row that is processed that is in the column named `termid`). | ||
- `{original}` is the name of a variable that holds the contents of the cell in the column named `original`, where its contents has been modified by capitalizing the first character of every word therein. | ||
|
||
When reading cells from the excel file, make sure that variables end up being proper texts, so: | ||
- cells must be read to produce texts, not floating point numbers or whatever; | ||
- cells that contain errors, are empty, void, null, NaN or similar, are considered to contain an empty string. | ||
|
||
Processing of a line of the excel file is as follows. First, check if there is a term file (in the terms directory) whose name is `{termid}.md`, and create a new file called `_{termid}.md` (in the same directory), to which the end-result of the processing will be written. Then, | ||
1. if there is such a `{termid}.md` file, then the result of the processing will be a copy of its contents, with the following modifications: | ||
- if `{formphrase}` contains non-whitespace characters, the line in the frontmatter that starts with `formPhrases`, must be replaced with a line that only contains the text `formPhrases: {formphrase}`. | ||
- if `{grouptags}` contains non-whitespace characters, its contents must be considered as a comma-separated list of words (each of which is a 'grouptag'). The line in the frontmatter that starts with `grouptags:` and that is followed with a comma separated list of words must then be modified such that every grouptag that is not a word therein, is appended to the list of words, such that the list of words remains a comma-separated word list. | ||
2. if there is no such file, the result is the creation of a new one using the text I specify below, which is all text (including the comments) between the two occurrences of `~~~`, and replace every occurrence of a variable name (e.g., `{termid}`) with its value: | ||
|
||
~~~ | ||
--- | ||
# Docusaurus header | ||
id: {termid} | ||
# TEv2 Curated Text Header | ||
term: {termid} | ||
termType: concept | ||
isa: | ||
glossaryTerm: {original} | ||
glossaryText: "glossary-text for `{original}`." | ||
hoverText: "hover-text for `{original}`." | ||
synonymOf: {synonymOf} | ||
grouptags: | ||
formPhrases: {formphrase} | ||
# Curation status | ||
status: proposed | ||
created: {today} | ||
updated: {today} | ||
# Origins/Acknowledgements | ||
contributors: RieksJ | ||
attribution: "[TNO Terminology Design](https://tno-terminology-design.github.io/tev2-specifications/docs/tev2)" | ||
originalLicense: "[CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/?ref=chooser-v1)" | ||
--- | ||
|
||
# {original} | ||
|
||
:::caution | ||
The entire section on Terminology Engine v 2 (TEv2) is still under construction.<br/> | ||
As TEv2 is not (yet) available, the texts that specify the tool are still 'raw', i.e. not yet processed.<br/>[readers](@) will need to see through some (currently unprocessed) notational conventions. | ||
::: | ||
|
||
### Summary | ||
|
||
:::info Editor's Note | ||
This file has been automatically created; it's header needs to be revised and its contents needs to be properly written. | ||
::: | ||
~~~ |