From 333787b2f00335ecefb730f471fb48ead4ba1c4b Mon Sep 17 00:00:00 2001
From: gitnnolabs
Date: Mon, 21 Feb 2022 02:14:56 -0300
Subject: [PATCH] Adiciona a capacidade de verificar se existe material suplementar e cadastra no opac_schema MatSuppl

---
 .../sync_kernel_to_website_operations.py      | 76 ++++++++++++++++++-
 airflow/dags/sync_kernel_to_website.py        | 10 ++-
 requirements.txt                              |  2 +-
 3 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/airflow/dags/operations/sync_kernel_to_website_operations.py b/airflow/dags/operations/sync_kernel_to_website_operations.py
index 0f4c52f6..cb5d199b 100644
--- a/airflow/dags/operations/sync_kernel_to_website_operations.py
+++ b/airflow/dags/operations/sync_kernel_to_website_operations.py
@@ -82,7 +82,8 @@ def ArticleFactory(
     document_order: int,
     document_xml_url: str,
     repeated_doc_pids=None,
-    fetch_document_xml:callable=None,
+    fetch_document_xml: callable = None,
+    fetch_documents_manifest: callable = None,
 ) -> models.Article:
     """Cria uma instância de artigo a partir dos dados de entrada.
 
@@ -96,6 +97,8 @@ def ArticleFactory(
         document_order (int): Posição do artigo.
         document_xml_url (str): URL do XML do artigo
         fetch_document_xml (callable): Função para obter o XML do Kernel caso
             necessário.
+        fetch_documents_manifest (callable): Function to fetch the JSON
+            Manifest from the Kernel when needed.
 
     Returns:
@@ -508,6 +511,66 @@ def _get_related_articles(xml):
             for related_dict in sps_package.related_articles:
                 _update_related_articles(article, related_dict)
 
+    def _update_suppl_material(document_id, filename, url):
+        """Build the supplementary material list for a single asset.
+
+        When the article is already registered, the asset is appended to
+        its current ``mat_suppl`` entries unless an equal one is there.
+
+        Data stored for each asset:
+
+        {
+            "url": "https://minio.scielo.br/documentstore/2237-9622/d6DyD7CHXbpTJbLq7NQQNdq/5d88e2211c5357e2a9d8caeac2170f4f3d1305d1.pdf",
+            "filename": "suppl01.pdf"
+        }
+
+        Returns:
+            list: ``models.MatSuppl`` entries, for new and for already
+                registered articles alike.
+        """
+        mat_suppl_entity = models.MatSuppl(url=url, filename=filename)
+
+        try:
+            # Is this an update of an already registered article?
+            _article = models.Article.objects.get(_id=document_id)
+        except models.Article.DoesNotExist:
+            # First registration: return a list, as the update path does.
+            return [mat_suppl_entity]
+
+        # Update: keep the supplementary material entries unique.
+        if mat_suppl_entity not in _article.mat_suppl:
+            _article.mat_suppl += [mat_suppl_entity]
+        return _article.mat_suppl
+
+    def _get_suppl_material(document_id, manifest):
+        """Collect the supplementary material listed on the Kernel manifest.
+
+        Assets whose name contains ``suppl`` are taken as supplementary
+        material. XML tags for supplementary material:
+        ["inline-supplementary-material", "supplementary-material"].
+
+        Returns:
+            list: ``models.MatSuppl`` entries; empty when there is none.
+        """
+        logging.info("Checking if exists supplementary material....")
+
+        # ``or {}``: a falsy lookup result is handled as "no assets".
+        assets = _nestget(manifest, "versions", 0, "assets") or {}
+        suppls = [k for k in assets if 'suppl' in k]
+
+        mat_suppl = []
+        if suppls:
+            logging.info("Exists supplementary material: %s" %
+                         (' '.join(suppls)))
+        # Every matching asset is registered, not only the first one.
+        for key in suppls:
+            entries = _update_suppl_material(
+                document_id, filename=key, url=_nestget(assets[key], 0, 1))
+            for entry in entries:
+                if entry not in mat_suppl:
+                    mat_suppl.append(entry)
+        return mat_suppl
+
     article.authors = list(_get_article_authors(data))
     article.authors_meta = _get_article_authors_meta(data)
     article.languages = list(_get_languages(data))
@@ -556,6 +619,11 @@ def _get_related_articles(xml):
     article.order = _get_order(document_order, article.pid)
     article.xml = document_xml_url
 
+    # Register the supplementary material
+    if fetch_documents_manifest:
+        manifest = fetch_documents_manifest(document_id)
+        article.mat_suppl = _get_suppl_material(document_id, manifest)
+
     # Se for uma errata ou retratação ou adendo.
     if article.type in ["correction", "retraction", "addendum"]:
         # Obtém o XML da errada no kernel
@@ -576,7 +644,8 @@ def try_register_documents(
     get_relation_data: callable,
     fetch_document_front: callable,
     article_factory: callable,
-    fetch_document_xml: callable,
+    fetch_document_xml: callable = None,
+    fetch_documents_manifest: callable = None,
 ) -> List[str]:
     """Registra documentos do Kernel na base de dados do `OPAC`.
 
@@ -629,7 +698,8 @@ def try_register_documents(
                 item.get("order"),
                 document_xml_url,
                 repeated_doc_pids,
-                fetch_document_xml
+                fetch_document_xml,
+                fetch_documents_manifest
             )
             document.save()
             logging.info("ARTICLE saved %s %s" % (document_id, issue_id))
diff --git a/airflow/dags/sync_kernel_to_website.py b/airflow/dags/sync_kernel_to_website.py
index 2eba4fcd..a49fafd7 100644
--- a/airflow/dags/sync_kernel_to_website.py
+++ b/airflow/dags/sync_kernel_to_website.py
@@ -162,6 +162,13 @@ def fetch_documents_xml(document_id):
     return fetch_data("/documents/%s" % (document_id), json=False)
 
 
+def fetch_documents_manifest(document_id):
+    """
+    Fetch the Document JSON manifest from the Kernel based on 'document_id'
+    """
+    return fetch_data("/documents/%s/manifest" % (document_id), json=True)
+
+
 def _get_relation_data_from_kernel_bundle(document_id, front_data=None):
     """
     Obtém os dados do documento no bundle
@@ -747,7 +754,7 @@ def _get_known_documents(**kwargs) -> Dict[str, List[str]]:
     )
 
     orphans = try_register_documents(
-        documents_to_get, _get_relation_data, fetch_documents_front, ArticleFactory, fetch_documents_xml,
+        documents_to_get, _get_relation_data, fetch_documents_front, ArticleFactory, fetch_documents_xml, fetch_documents_manifest,
     )
 
     Variable.set("orphan_documents", orphans, serialize_json=True)
@@ -916,6 +923,7 @@ def register_last_issues(ds, **kwargs):
         except AttributeError:
             logging.info("No issues are registered to models.Journal: %s " % journal)
 
+
 def must_send_email(ds, **kwargs):
     """If IS_SPORADIC == True return False to avoid send e-mail,
     but if IS_SPORADIC == False, return True to send e-mail.
diff --git a/requirements.txt b/requirements.txt
index e00a57f3..8139f08e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,6 +5,6 @@ deepdiff[murmur]==4.0.7
 feedparser==5.2.1
 beautifulsoup4==4.9.0
 git+https://github.com/scieloorg/xylose.git@1.35.8#egg=xylose
-git+https://github.com/scieloorg/opac_schema.git@v2.60#egg=opac_schema
+git+https://github.com/scieloorg/opac_schema.git@v2.65#egg=opac_schema
 git+https://github.com/scieloorg/packtools.git@2.6.4#egg=packtools
 aiohttp==3.6.2
\ No newline at end of file