From 6070d9bf28f71920516af3e36004cbeed155a6b7 Mon Sep 17 00:00:00 2001 From: bjwswang Date: Mon, 22 Jan 2024 09:15:31 +0000 Subject: [PATCH] chore: run pylint locally and fix lint issues Signed-off-by: bjwswang --- .../{lint_test.yaml => chart_lint_test.yaml} | 0 .pylintrc | 17 +- Makefile | 11 + README.md | 6 + config/crd/kustomization.yaml | 2 +- deploy/charts/arcadia/Chart.yaml | 2 +- .../charts/arcadia/templates/dataprocess.yaml | 2 +- deploy/llms/utils.py | 37 +- .../chat_app_english_teacher.py | 35 +- .../chat_using_private_knowledgebase.py | 32 +- pypi/data-processing/src/common/config.py | 60 +- pypi/data-processing/src/common/const.py | 22 +- .../src/common/special_characters.py | 31 +- .../src/controller/data_process_controller.py | 111 +- .../data_store_clients/minio_store_client.py | 62 +- .../data_store_process/minio_store_process.py | 1344 ++++++++--------- .../postgresql_pool_client.py | 115 +- .../data_process_db_operate.py | 149 +- .../data_process_detail_db_operate.py | 288 ++-- .../data_process_detail_preview_db_operate.py | 65 +- .../data_process_document_chunk_db_operate.py | 103 +- .../data_process_document_db_operate.py | 130 +- .../data_process_log_db_operate.py | 71 +- .../data_process_stage_log_db_operate.py | 71 +- .../src/file_handle/common_handle.py | 1322 +++++++--------- .../src/file_handle/csv_handle.py | 146 +- .../src/file_handle/pdf_handle.py | 73 +- .../src/file_handle/word_handle.py | 71 +- pypi/data-processing/src/kube/client.py | 68 +- .../src/kube/custom_resources.py | 3 +- pypi/data-processing/src/kube/dataset_cr.py | 102 +- pypi/data-processing/src/kube/minio_cr.py | 58 +- pypi/data-processing/src/kube/model_cr.py | 99 +- .../data-processing/src/kube/postgresql_cr.py | 32 +- .../src/llm_api_service/base_qa_provider.py | 11 +- .../llm_api_service/qa_provider_open_ai.py | 108 +- .../qa_provider_zhi_pu_ai_online.py | 102 +- .../src/llm_prompt_template/llm_prompt.py | 4 +- .../src/parallel/thread_parallel.py | 30 +- pypi/data-processing/src/server.py | 76 +- .../src/service/data_process_service.py | 913 +++++------ .../src/transform/text/clean_transform.py | 247 ++- .../src/transform/text/privacy_transform.py | 432 +++--- .../src/transform/text/support_type.py | 189 ++- pypi/data-processing/src/utils/class_utils.py | 1 + pypi/data-processing/src/utils/csv_utils.py | 25 +- .../src/utils/date_time_utils.py | 14 +- pypi/data-processing/src/utils/docx_utils.py | 8 +- pypi/data-processing/src/utils/file_utils.py | 15 +- pypi/data-processing/src/utils/json_utils.py | 78 +- pypi/data-processing/src/utils/log_utils.py | 17 +- pypi/data-processing/src/utils/pdf_utils.py | 10 +- pypi/data-processing/src/utils/sanic_utils.py | 33 +- pypi/ragas_once/pyproject.toml | 4 +- pypi/ragas_once/ragas_once/cli.py | 69 +- pypi/ragas_once/ragas_once/wrapper.py | 80 +- pypi/ragas_once/setup.py | 14 +- 57 files changed, 3183 insertions(+), 4037 deletions(-) rename .github/workflows/{lint_test.yaml => chart_lint_test.yaml} (100%) diff --git a/.github/workflows/lint_test.yaml b/.github/workflows/chart_lint_test.yaml similarity index 100% rename from .github/workflows/lint_test.yaml rename to .github/workflows/chart_lint_test.yaml diff --git a/.pylintrc b/.pylintrc index c6dcdc0e8..acc9f4e8b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -52,7 +52,7 @@ ignore=CVS # ignore-list. The regex matches against paths and can be in Posix or Windows # format. Because '\\' represents the directory delimiter on Windows systems, # it can't be used as an escape character. 
-ignore-paths= +ignore-paths=./deploy,./examples # Files or directories matching the regular expression patterns are skipped. # The regex matches against base names, not paths. The default value ignores @@ -337,7 +337,7 @@ indent-after-paren=4 indent-string=' ' # Maximum number of characters on a single line. -max-line-length=100 +max-line-length=200 # Maximum number of lines in a module. max-module-lines=1000 @@ -427,10 +427,15 @@ disable=raw-checker-failed, file-ignored, suppressed-message, useless-suppression, - deprecated-pragma, - use-symbolic-message-instead, - use-implicit-booleaness-not-comparison-to-string, - use-implicit-booleaness-not-comparison-to-zero + deprecated-pragma,redefined-outer-name, + use-symbolic-message-instead,missing-class-docstring, + missing-module-docstring,too-many-instance-attributes, + logging-fstring-interpolation,too-few-public-methods, + invalid-character-zero-width-space,missing-function-docstring, + duplicate-value,too-many-lines,dangerous-default-value,deprecated-method, + broad-exception-caught,redefined-builtin,c-extension-no-member,too-many-arguments, + too-many-branches,too-many-locals,too-many-statements,f-string-without-interpolation, + consider-using-enumerate, # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/Makefile b/Makefile index 718144d57..2c6c7111a 100644 --- a/Makefile +++ b/Makefile @@ -300,6 +300,17 @@ prepare-push: manifests generate fmt vet gql-gen @go install github.com/swaggo/swag/cmd/swag@latest @swag init -o apiserver/docs . +PYTHON_INDEX_URL ?=https://pypi.mirrors.ustc.edu.cn/simple/ +.PHONY: prepare-push-pypi +prepare-push-pypi: + @echo "install black" + @pip install pylint black isort -i ${PYTHON_INDEX_URL} + @echo "format python code" + @black . + @echo "sort python imports" + @isort . + @echo "run pylint on data-processing" + @pylint --rcfile .pylintrc ./**/*.py # Commands for Data-Processing DATA_PROCESSING_IMAGE ?= kubebb/dp-base diff --git a/README.md b/README.md index 578613610..9b96f22c7 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,12 @@ Go Report Card + + Pylint Card + + + CodeStyle +

diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index b932f0f6b..a1a8b264d 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -15,10 +15,10 @@ resources: - bases/arcadia.kubeagi.k8s.com.cn_applications.yaml - bases/chain.arcadia.kubeagi.k8s.com.cn_llmchains.yaml - bases/chain.arcadia.kubeagi.k8s.com.cn_retrievalqachains.yaml +- bases/chain.arcadia.kubeagi.k8s.com.cn_apichains.yaml - bases/prompt.arcadia.kubeagi.k8s.com.cn_prompts.yaml - bases/retriever.arcadia.kubeagi.k8s.com.cn_knowledgebaseretrievers.yaml - bases/evaluation.arcadia.kubeagi.k8s.com.cn_rags.yaml -- bases/chain.kubeagi.k8s.com.cn_apichains.yaml #+kubebuilder:scaffold:crdkustomizeresource patchesStrategicMerge: diff --git a/deploy/charts/arcadia/Chart.yaml b/deploy/charts/arcadia/Chart.yaml index 3a2151987..25a462ae7 100644 --- a/deploy/charts/arcadia/Chart.yaml +++ b/deploy/charts/arcadia/Chart.yaml @@ -2,7 +2,7 @@ apiVersion: v2 name: arcadia description: A Helm chart(KubeBB Component) for KubeAGI Arcadia type: application -version: 0.2.16 +version: 0.2.17 appVersion: "0.1.0" keywords: diff --git a/deploy/charts/arcadia/templates/dataprocess.yaml b/deploy/charts/arcadia/templates/dataprocess.yaml index ec5a757b9..65ae38b95 100644 --- a/deploy/charts/arcadia/templates/dataprocess.yaml +++ b/deploy/charts/arcadia/templates/dataprocess.yaml @@ -37,7 +37,7 @@ spec: command: - "/bin/sh" - "-c" - - "python /arcadia_app/data_manipulation/server.py" + - "python /arcadia_app/src/server.py" --- apiVersion: v1 kind: Service diff --git a/deploy/llms/utils.py b/deploy/llms/utils.py index d1e1821d3..165159e16 100644 --- a/deploy/llms/utils.py +++ b/deploy/llms/utils.py @@ -1,6 +1,5 @@ import asyncio import binascii -from collections import defaultdict import contextlib import errno import functools @@ -18,33 +17,22 @@ import tempfile import threading import time -from urllib.parse import urlencode, unquote, urlparse, parse_qsl, urlunparse import warnings +from collections import defaultdict from inspect import signature from pathlib import Path from subprocess import list2cmdline -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Optional, - Sequence, - Tuple, - Union, - Coroutine, - List, - Mapping, -) +from typing import (TYPE_CHECKING, Any, Coroutine, Dict, List, Mapping, + Optional, Sequence, Tuple, Union) +from urllib.parse import parse_qsl, unquote, urlencode, urlparse, urlunparse # Import psutil after ray so the packaged version is used. 
import psutil -from google.protobuf import json_format - import ray import ray._private.ray_constants as ray_constants -from ray.core.generated.runtime_env_common_pb2 import ( - RuntimeEnvInfo as ProtoRuntimeEnvInfo, -) +from google.protobuf import json_format +from ray.core.generated.runtime_env_common_pb2 import \ + RuntimeEnvInfo as ProtoRuntimeEnvInfo if TYPE_CHECKING: from ray.runtime_env import RuntimeEnv @@ -281,9 +269,8 @@ def get_visible_accelerator_ids() -> Mapping[str, Optional[List[str]]]: to the visible ids.""" from ray._private.accelerators import ( - get_all_accelerator_resource_names, get_accelerator_manager_for_resource, - ) + get_all_accelerator_resource_names) return { accelerator_resource_name: get_accelerator_manager_for_resource( @@ -1535,10 +1522,10 @@ def check_version_info(cluster_metadata): # but for now, we don't have the same python version of fastchat and ray # so allow to use environment variable to overwrite them # make sure the overwritten version will be compatible - if os.getenv('RAY_VERSION') is not None: - version_info = (os.getenv('RAY_VERSION'), version_info[1]) - if os.getenv('PYTHON_VERSION') is not None: - version_info = (version_info[0], os.getenv('PYTHON_VERSION')) + if os.getenv("RAY_VERSION") is not None: + version_info = (os.getenv("RAY_VERSION"), version_info[1]) + if os.getenv("PYTHON_VERSION") is not None: + version_info = (version_info[0], os.getenv("PYTHON_VERSION")) # ---------------------------- KubeAGI end -------------------------------- if version_info != cluster_version_info: diff --git a/examples/app-ui-using-streamlit/chat_app_english_teacher.py b/examples/app-ui-using-streamlit/chat_app_english_teacher.py index 522808837..aa5754de7 100644 --- a/examples/app-ui-using-streamlit/chat_app_english_teacher.py +++ b/examples/app-ui-using-streamlit/chat_app_english_teacher.py @@ -1,15 +1,26 @@ -import streamlit as st -import requests import os +import requests +import streamlit as st + with st.sidebar: - server_url = st.text_input("服务 apiserver 请求地址, 默认为 http://arcadia-apiserver.kubeagi-system.svc:8081/chat", key="url") - conversion_id = st.text_input("如果想继续的话,可以输入上次的conversion_id,留空表示新对话", key="conversion_id") + server_url = st.text_input( + "服务 apiserver 请求地址, 默认为 http://arcadia-apiserver.kubeagi-system.svc:8081/chat", + key="url", + ) + conversion_id = st.text_input( + "如果想继续的话,可以输入上次的conversion_id,留空表示新对话", key="conversion_id" + ) st.title("💬 Chat with kubeagi") st.caption("🚀 A chatbot powered by Kubeagi") if "messages" not in st.session_state: - st.session_state["messages"] = [{"role": "assistant", "content": "Hello, I am English Teacher 🧑‍🏫 From KubeAGI 🤖"}] + st.session_state["messages"] = [ + { + "role": "assistant", + "content": "Hello, I am English Teacher 🧑‍🏫 From KubeAGI 🤖", + } + ] if "first_show" not in st.session_state: st.session_state["first_show"] = True @@ -21,15 +32,23 @@ st.chat_message(msg["role"]).write(msg["content"]) if prompt := st.chat_input(): - response = requests.post(server_url, - json={"query":prompt,"response_mode":"blocking","conversion_id":conversion_id,"app_name":"base-chat-english-teacher", "app_namespace":"kubeagi-system"}) + response = requests.post( + server_url, + json={ + "query": prompt, + "response_mode": "blocking", + "conversion_id": conversion_id, + "app_name": "base-chat-english-teacher", + "app_namespace": "kubeagi-system", + }, + ) st.session_state.messages.append({"role": "user", "content": prompt}) st.chat_message("user").write(prompt) msg = response.json()["message"] conversion_id = 
response.json()["conversion_id"] if st.session_state["first_show"]: - st.info('这次聊天的 conversion_id 是: '+conversion_id, icon="ℹ️") + st.info("这次聊天的 conversion_id 是: " + conversion_id, icon="ℹ️") st.session_state["first_show"] = False st.session_state.messages.append({"role": "assistant", "content": msg}) diff --git a/examples/app-ui-using-streamlit/chat_using_private_knowledgebase.py b/examples/app-ui-using-streamlit/chat_using_private_knowledgebase.py index 0ada0eff5..ee24531e6 100644 --- a/examples/app-ui-using-streamlit/chat_using_private_knowledgebase.py +++ b/examples/app-ui-using-streamlit/chat_using_private_knowledgebase.py @@ -1,15 +1,23 @@ -import streamlit as st -import requests import os +import requests +import streamlit as st + with st.sidebar: - server_url = st.text_input("服务 apiserver 请求地址, 默认为 http://arcadia-apiserver.kubeagi-system.svc:8081/chat", key="url") - conversion_id = st.text_input("如果想继续的话,可以输入上次的conversion_id,留空表示新对话", key="conversion_id") + server_url = st.text_input( + "服务 apiserver 请求地址, 默认为 http://arcadia-apiserver.kubeagi-system.svc:8081/chat", + key="url", + ) + conversion_id = st.text_input( + "如果想继续的话,可以输入上次的conversion_id,留空表示新对话", key="conversion_id" + ) st.title("💬 Chat with kubeagi") st.caption("🚀 A chatbot powered by Kubeagi") if "messages" not in st.session_state: - st.session_state["messages"] = [{"role": "assistant", "content": "您好,您可以问我任何关于考勤制度的问题,很高心为您服务。"}] + st.session_state["messages"] = [ + {"role": "assistant", "content": "您好,您可以问我任何关于考勤制度的问题,很高心为您服务。"} + ] if "first_show" not in st.session_state: st.session_state["first_show"] = True @@ -21,15 +29,23 @@ st.chat_message(msg["role"]).write(msg["content"]) if prompt := st.chat_input(): - response = requests.post(server_url, - json={"query":prompt,"response_mode":"blocking","conversion_id":conversion_id,"app_name":"chat-with-kaoqin-kb", "app_namespace":"kubeagi-system"}) + response = requests.post( + server_url, + json={ + "query": prompt, + "response_mode": "blocking", + "conversion_id": conversion_id, + "app_name": "chat-with-kaoqin-kb", + "app_namespace": "kubeagi-system", + }, + ) st.session_state.messages.append({"role": "user", "content": prompt}) st.chat_message("user").write(prompt) msg = response.json()["message"] conversion_id = response.json()["conversion_id"] if st.session_state["first_show"]: - st.info('这次聊天的 conversion_id 是: '+conversion_id, icon="ℹ️") + st.info("这次聊天的 conversion_id 是: " + conversion_id, icon="ℹ️") st.session_state["first_show"] = False st.session_state.messages.append({"role": "assistant", "content": msg}) diff --git a/pypi/data-processing/src/common/config.py b/pypi/data-processing/src/common/config.py index 0c06325e8..ec664ee94 100644 --- a/pypi/data-processing/src/common/config.py +++ b/pypi/data-processing/src/common/config.py @@ -15,92 +15,82 @@ import logging import os -import traceback -from pathlib import Path - -import yaml +import log_tag_const from kube import minio_cr, model_cr, postgresql_cr from utils.class_utils import Singleton -from . 
import log_tag_const - logger = logging.getLogger(__name__) class Config(metaclass=Singleton): """Configuration class to store the env values.""" - + def __init__(self): logger.debug(f"{log_tag_const.CONFIG} start to load config file.") self.__set_property_value() - def __set_property_value(self): """设置属性的值""" # kubernetes # namespace - k8s_pod_namespace = os.getenv('POD_NAMESPACE', 'arcadia') + k8s_pod_namespace = os.getenv("POD_NAMESPACE", "arcadia") self.k8s_pod_namespace = k8s_pod_namespace # config - k8s_default_config = os.getenv('DEFAULT_CONFIG', 'arcadia-config') + k8s_default_config = os.getenv("DEFAULT_CONFIG", "arcadia-config") self.k8s_default_config = k8s_default_config - minio_config = minio_cr.get_minio_config_in_k8s_configmap( - namespace=k8s_pod_namespace, - config_map_name=k8s_default_config + namespace=k8s_pod_namespace, config_map_name=k8s_default_config ) if minio_config is None: minio_config = {} - + # minio access key - self.minio_access_key = minio_config.get('minio_access_key') + self.minio_access_key = minio_config.get("minio_access_key") # minio secret key - self.minio_secret_key = minio_config.get('minio_secret_key') + self.minio_secret_key = minio_config.get("minio_secret_key") # minio api url - self.minio_api_url = minio_config.get('minio_api_url') + self.minio_api_url = minio_config.get("minio_api_url") # minio secure - # if use HTTP, secure = False; + # if use HTTP, secure = False; # if use HTTPS, secure = True; - self.minio_secure = minio_config.get('minio_secure') + self.minio_secure = minio_config.get("minio_secure") # minio data set prefix - self.minio_dataset_prefix = 'dataset' + self.minio_dataset_prefix = "dataset" llm_qa_retry_count = model_cr.get_llm_qa_retry_count_in_k8s_configmap( - namespace=k8s_pod_namespace, - config_map_name=k8s_default_config - ) - + namespace=k8s_pod_namespace, config_map_name=k8s_default_config + ) + if llm_qa_retry_count is None: llm_qa_retry_count = 5 - self.llm_qa_retry_count = int(llm_qa_retry_count) + self.llm_qa_retry_count = int(llm_qa_retry_count) # knowledge # chunk size self.knowledge_chunk_size = 500 # chunk overlap self.knowledge_chunk_overlap = 50 - + # backend PostgreSQL postgresql_config = postgresql_cr.get_postgresql_config_in_k8s_configmap( - namespace=k8s_pod_namespace, - config_map_name=k8s_default_config - ) + namespace=k8s_pod_namespace, config_map_name=k8s_default_config + ) if postgresql_config is None: postgresql_config = {} - + # host - self.pg_host = postgresql_config.get('host') + self.pg_host = postgresql_config.get("host") # port - self.pg_port = postgresql_config.get('port') + self.pg_port = postgresql_config.get("port") # user - self.pg_user = postgresql_config.get('user') + self.pg_user = postgresql_config.get("user") # password - self.pg_password = postgresql_config.get('password') + self.pg_password = postgresql_config.get("password") # database name - self.pg_database = postgresql_config.get('database') + self.pg_database = postgresql_config.get("database") config = Config() diff --git a/pypi/data-processing/src/common/const.py b/pypi/data-processing/src/common/const.py index 8726cc8de..babc37fd9 100644 --- a/pypi/data-processing/src/common/const.py +++ b/pypi/data-processing/src/common/const.py @@ -12,18 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-llm_wait_seconds = 120 +LLM_WAIT_SECONDS = 120 -clean_support_type = [ - 'remove_invisible_characters', - 'space_standardization', - 'remove_garbled_text', - 'traditional_to_simplified', - 'remove_html_tag', - 'remove_emojis' -] -privacy_support_type = [ - 'remove_email', - 'remove_ip_address', - 'remove_number' +CLEAN_SUPPORT_TYPE = [ + "remove_invisible_characters", + "space_standardization", + "remove_garbled_text", + "traditional_to_simplified", + "remove_html_tag", + "remove_emojis", ] +PRIVACY_SUPPORT_TYPE = ["remove_email", "remove_ip_address", "remove_number"] diff --git a/pypi/data-processing/src/common/special_characters.py b/pypi/data-processing/src/common/special_characters.py index ac104881d..24c504b7c 100644 --- a/pypi/data-processing/src/common/special_characters.py +++ b/pypi/data-processing/src/common/special_characters.py @@ -16,9 +16,10 @@ import emoji +# referenced from https://github.com/alibaba/data-juicer/blob/main/data_juicer/ops/common/special_characters.py#L26 + # special characters -MAIN_SPECIAL_CHARACTERS = string.punctuation + string.digits \ - + string.whitespace +MAIN_SPECIAL_CHARACTERS = string.punctuation + string.digits + string.whitespace OTHER_SPECIAL_CHARACTERS = ( "’ “— ™ – •‘œ    ˜ ‚ƒ„’“”–ー一▬…✦�­£​•€«»°·═" "×士^˘⇓↓↑←→()§″′´¿−±∈¢ø‚„½¼¾¹²³―⁃,ˌ¸‹›ʺˈʻ¦‐⠀‰……‑≤≥‖" @@ -34,6 +35,26 @@ # whitespaces in unicode can be found here: # https://en.wikipedia.org/wiki/Whitespace_character VARIOUS_WHITESPACES = { - ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', - ' ', ' ', ' ', ' ', '​', '‌', '‍', '⁠', '', '„' -} \ No newline at end of file + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "​", + "‌", + "‍", + "⁠", + "", + "„", +} diff --git a/pypi/data-processing/src/controller/data_process_controller.py b/pypi/data-processing/src/controller/data_process_controller.py index fd1676591..dad1b08f5 100644 --- a/pypi/data-processing/src/controller/data_process_controller.py +++ b/pypi/data-processing/src/controller/data_process_controller.py @@ -15,8 +15,6 @@ from sanic import Blueprint from sanic.response import json - -from file_handle import pdf_handle from service import data_process_service from transform.text import support_type @@ -24,28 +22,26 @@ data_process = Blueprint("data_process", url_prefix="/") -@data_process.route('list-by-page', methods=['POST']) +@data_process.route("list-by-page", methods=["POST"]) async def list_by_page(request): res = data_process_service.list_by_page( - request.json, - pool=request.app.config['conn_pool'] + request.json, pool=request.app.config["conn_pool"] ) return json(res) -@data_process.route('list-by-count', methods=['POST']) +@data_process.route("list-by-count", methods=["POST"]) async def list_by_count(request): res = data_process_service.list_by_count( - request.json, - pool=request.app.config['conn_pool'] + request.json, pool=request.app.config["conn_pool"] ) return json(res) -@data_process.route('add', methods=['POST']) +@data_process.route("add", methods=["POST"]) async def add(request): """Add a new data process task. 
- + example for request.json { "name": "小T_test_0201", @@ -63,71 +59,47 @@ async def add(request): "data_process_config_info": [], "creator": "", "namespace": "abc" - } + } """ - res = data_process_service.add( - request.json, - pool=request.app.config['conn_pool'] - ) + res = data_process_service.add(request.json, pool=request.app.config["conn_pool"]) return json(res) -@data_process.route('delete-by-id', methods=['POST']) +@data_process.route("delete-by-id", methods=["POST"]) async def delete_by_id(request): res = data_process_service.delete_by_id( - request.json, - pool=request.app.config['conn_pool'] + request.json, pool=request.app.config["conn_pool"] ) return json(res) -@data_process.route('info-by-id', methods=['POST']) +@data_process.route("info-by-id", methods=["POST"]) async def info_by_id(request): """Get the detail info by id. - + example for request.json { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } """ res = data_process_service.info_by_id( - request.json, - pool=request.app.config['conn_pool'] + request.json, pool=request.app.config["conn_pool"] ) - return json(res) + return json(res) -@data_process.route('text-process-type', methods=['POST']) -async def text_process_type(request): +@data_process.route("text-process-type", methods=["POST"]) +async def text_process_type(_request): """Get the support type for transforming the text content.""" - return json({ - 'status': 200, - 'message': '', - 'data': support_type.get_default_support_types() - }) - -@data_process.route('test', methods=['POST']) -async def test(request): - """Get the support type for transforming the text content.""" - res = pdf_handle.test({ - 'support_type_map': { - 'remove_invisible_characters': 1 - }, - 'data': '“一户一表、水表出户、抄表到户”是指一个家庭用户安装一个计量水表,计量水表安装在住宅的公共部位,供水企业抄表到户,按户计量收费。', - 'file_name': '222', - 'task_id': '111', - 'conn_pool': request.app.config['conn_pool'] - }) - return json({ - 'status': 200, - 'message': '', - 'data': res - }) - -@data_process.route('check-task-name', methods=['POST']) + return json( + {"status": 200, "message": "", "data": support_type.get_default_support_types()} + ) + + +@data_process.route("check-task-name", methods=["POST"]) async def check_task_name(request): """check task name by name and namespace. - + example for request.json { "name": "test", @@ -135,30 +107,30 @@ async def check_task_name(request): } """ res = data_process_service.check_task_name( - request.json, - pool=request.app.config['conn_pool'] + request.json, pool=request.app.config["conn_pool"] ) - return json(res) + return json(res) + -@data_process.route('get-log-info', methods=['POST']) +@data_process.route("get-log-info", methods=["POST"]) async def get_log_info(request): """check task name by name and namespace. - + example for request.json { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } """ res = data_process_service.get_log_info( - request.json, - pool=request.app.config['conn_pool'] + request.json, pool=request.app.config["conn_pool"] ) - return json(res) + return json(res) + -@data_process.route('get-log-by-file-name', methods=['POST']) +@data_process.route("get-log-by-file-name", methods=["POST"]) async def get_log_by_file_name(request): """check task name by name and namespace. 
- + example for request.json { "id": "01HGWBE48DT3ADE9ZKA62SW4WS", @@ -167,24 +139,19 @@ async def get_log_by_file_name(request): } """ res = data_process_service.get_log_by_file_name( - request.json, - pool=request.app.config['conn_pool'] + request.json, pool=request.app.config["conn_pool"] ) - return json(res) + return json(res) -@data_process.route('retry', methods=['POST']) +@data_process.route("retry", methods=["POST"]) async def retry(request): """check task name by name and namespace. - + example for request.json { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } """ - res = data_process_service.retry( - request.json, - pool=request.app.config['conn_pool'] - ) - return json(res) - + res = data_process_service.retry(request.json, pool=request.app.config["conn_pool"]) + return json(res) diff --git a/pypi/data-processing/src/data_store_clients/minio_store_client.py b/pypi/data-processing/src/data_store_clients/minio_store_client.py index 95328c0e5..cc0f708f0 100644 --- a/pypi/data-processing/src/data_store_clients/minio_store_client.py +++ b/pypi/data-processing/src/data_store_clients/minio_store_client.py @@ -17,12 +17,11 @@ import traceback import urllib3 +from common import log_tag_const +from common.config import config from minio import Minio from minio.commonconfig import Tags from minio.error import S3Error - -from common import log_tag_const -from common.config import config from utils import file_utils logger = logging.getLogger(__name__) @@ -37,16 +36,16 @@ def get_minio_client(): secure=bool(config.minio_secure), http_client=urllib3.PoolManager( timeout=urllib3.Timeout.DEFAULT_TIMEOUT, - cert_reqs='CERT_NONE', + cert_reqs="CERT_NONE", retries=urllib3.Retry( total=5, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504], - ) - ) + ), + ), ) - - + + def download( minio_client, folder_prefix, @@ -63,17 +62,13 @@ def download( file_path = file_utils.get_temp_file_path() # 如果文件夹不存在,则创建 - directory_path = file_path + 'original' + directory_path = file_path + "original" if not os.path.exists(directory_path): os.makedirs(directory_path) - file_path = directory_path + '/' + file_name + file_path = directory_path + "/" + file_name - minio_client.fget_object( - bucket_name, - folder_prefix + '/' + file_name, - file_path - ) + minio_client.fget_object(bucket_name, folder_prefix + "/" + file_name, file_path) def upload_files_to_minio_with_tags( @@ -85,7 +80,7 @@ def upload_files_to_minio_with_tags( data_volumes_file, ): """Upload the files to minio with tags - + local_folder: local folder; minio_bucket: bucket name; minio_prefix: folder prefix; @@ -98,20 +93,23 @@ def upload_files_to_minio_with_tags( # 设置tag信息 tags = Tags(for_object=True) tags["phase"] = "final" - - for root, dirs, files in os.walk(local_folder): + + for root, _, files in os.walk(local_folder): for file in files: local_file_path = os.path.join(root, file) minio_object_name = os.path.join( - minio_prefix, - os.path.relpath(local_file_path, local_folder) + minio_prefix, os.path.relpath(local_file_path, local_folder) ) # 针对QA拆分类型的处理需要加上object_type和object_count标签 - if any(d.get('type') == 'qa_split' for d in support_type): - tags['object_type'] = 'QA' - - filtered = [item['object_count'] for item in data_volumes_file if item['object_name'] == file] + if any(d.get("type") == "qa_split" for d in support_type): + tags["object_type"] = "QA" + + filtered = [ + item["object_count"] + for item in data_volumes_file + if item["object_name"] == file + ] if filtered: tags["object_count"] = str(filtered[0]) @@ -123,10 +121,12 @@ def 
upload_files_to_minio_with_tags( # 删除本地文件 file_utils.delete_file(local_file_path) except S3Error as ex: - logger.error(''.join([ - f"{log_tag_const.MINIO} Error uploading {minio_object_name} ", - f"to {minio_bucket}. \n{traceback.format_exc()}" - ])) - - - + logger.error( + "".join( + [ + f"{log_tag_const.MINIO} Error uploading {minio_object_name} ", + f"The error is: \n{str(ex)}\n", + f"to {minio_bucket}. \n{traceback.format_exc()}", + ] + ) + ) diff --git a/pypi/data-processing/src/data_store_process/minio_store_process.py b/pypi/data-processing/src/data_store_process/minio_store_process.py index 8ae2a710b..fce9ee798 100644 --- a/pypi/data-processing/src/data_store_process/minio_store_process.py +++ b/pypi/data-processing/src/data_store_process/minio_store_process.py @@ -12,17 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. - -import io import logging -import os import traceback -from pathlib import Path -import pandas as pd -import ujson import ulid - from common import const, log_tag_const from common.config import config from data_store_clients import minio_store_client @@ -33,7 +26,7 @@ data_process_document_db_operate, data_process_log_db_operate, data_process_stage_log_db_operate) -from file_handle import common_handle, csv_handle, pdf_handle, word_handle +from file_handle import common_handle, pdf_handle, word_handle from kube import dataset_cr from utils import date_time_utils, file_utils, json_utils @@ -46,51 +39,50 @@ def text_manipulate( id, ): """Manipulate the text content. - - req_json is a dictionary object. + + req_json is a dictionary object. """ - - namespace = req_json['namespace'] - support_type = req_json['data_process_config_info'] - file_names = req_json['file_names'] + + namespace = req_json["namespace"] + support_type = req_json["data_process_config_info"] + file_names = req_json["file_names"] # 新增数据处理任务日志 log_id = ulid.ulid() insert_log_item = { - 'id': log_id, - 'task_id': id, - 'type': 'NOW', - 'error_msg': '', - 'creator': req_json.get('creator') + "id": log_id, + "task_id": id, + "type": "NOW", + "error_msg": "", + "creator": req_json.get("creator"), } - data_process_log_db_operate.add( - insert_log_item, - pool=pool - ) + data_process_log_db_operate.add(insert_log_item, pool=pool) try: # update the dataset status update_dataset = _update_dateset_status( - namespace=req_json['namespace'], - version_data_set_name=req_json['version_data_set_name'], - reason='processing', - message='Data processing in progress', + namespace=req_json["namespace"], + version_data_set_name=req_json["version_data_set_name"], + reason="processing", + message="Data processing in progress", task_id=id, log_id=log_id, - creator=req_json.get('creator'), - pool=pool + creator=req_json.get("creator"), + pool=pool, ) - if update_dataset['status'] != 200: + if update_dataset["status"] != 200: return update_dataset # minio 数据集统一前缀 minio_dataset_prefix = config.minio_dataset_prefix - folder_prefix = '/'.join([ - minio_dataset_prefix, - req_json['pre_data_set_name'], - req_json['pre_data_set_version'] - ]) + folder_prefix = "/".join( + [ + minio_dataset_prefix, + req_json["pre_data_set_name"], + req_json["pre_data_set_version"], + ] + ) # get a minio client minio_client = minio_store_client.get_minio_client() @@ -99,40 +91,37 @@ def text_manipulate( for file_name in file_names: # 新增文档处理进度信息 document_id = ulid.ulid() - extension = file_utils.get_file_extension(file_name['name']) + extension = 
file_utils.get_file_extension(file_name["name"]) document_insert_item = { - 'id': document_id, - 'task_id': id, - 'file_name': file_name['name'], - 'status': 'not_start', - 'progress': '0', - 'creator': req_json['creator'], - 'from_source_type': 'MinIO', - 'from_source_path': config.minio_api_url, - 'document_type': extension + "id": document_id, + "task_id": id, + "file_name": file_name["name"], + "status": "not_start", + "progress": "0", + "creator": req_json["creator"], + "from_source_type": "MinIO", + "from_source_path": config.minio_api_url, + "document_type": extension, } - data_process_document_db_operate.add( - document_insert_item, - pool=pool - ) - file_name['document_id']=document_id + data_process_document_db_operate.add(document_insert_item, pool=pool) + file_name["document_id"] = document_id # 文件处理 - task_status = 'process_complete' - error_msg = '' + task_status = "process_complete" + error_msg = "" # 存放每个文件对应的数据量 data_volumes_file = [] - + for item in file_names: result = None - file_name = item['name'] + file_name = item["name"] # 将文件下载到本地 minio_store_client.download( minio_client, bucket_name=namespace, folder_prefix=folder_prefix, - file_name=file_name + file_name=file_name, ) # 新增阶段性日志-开始 @@ -140,1000 +129,883 @@ def text_manipulate( req_json, pool=pool, task_id=id, - document_id=item.get('document_id'), - stage='start' + document_id=item.get("document_id"), + stage="start", ) insert_stage_log_params = { - 'task_id': id, - 'log_id': log_id, - 'file_name': file_name, - 'stage_name': 'start', - 'stage_status': 'success', - 'stage_detail': start_stage_detail.get('data'), - 'creator': req_json.get('creator') + "task_id": id, + "log_id": log_id, + "file_name": file_name, + "stage_name": "start", + "stage_status": "success", + "stage_detail": start_stage_detail.get("data"), + "creator": req_json.get("creator"), } - data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool - ) + data_process_stage_log_db_operate.insert(insert_stage_log_params, pool=pool) file_extension = file_utils.get_file_extension(file_name) - if file_extension in ['pdf']: + if file_extension in ["pdf"]: # 处理PDF文件 result = pdf_handle.text_manipulate( - chunk_size=req_json.get('chunk_size'), - chunk_overlap=req_json.get('chunk_overlap'), + chunk_size=req_json.get("chunk_size"), + chunk_overlap=req_json.get("chunk_overlap"), file_name=file_name, - document_id=item.get('document_id'), + document_id=item.get("document_id"), support_type=support_type, conn_pool=pool, task_id=id, - create_user=req_json['creator'] + create_user=req_json["creator"], ) - - elif file_extension in ['docx']: + + elif file_extension in ["docx"]: # 处理.docx文件 result = word_handle.docx_text_manipulate( - chunk_size=req_json.get('chunk_size'), - chunk_overlap=req_json.get('chunk_overlap'), + chunk_size=req_json.get("chunk_size"), + chunk_overlap=req_json.get("chunk_overlap"), file_name=file_name, - document_id=item.get('document_id'), + document_id=item.get("document_id"), support_type=support_type, conn_pool=pool, task_id=id, - create_user=req_json['creator'] + create_user=req_json["creator"], ) - + # 将下载的本地文件删除 _remove_local_file(file_name) # 判断是否存在qa拆分 - has_qa_split = any(item.get('type') == 'qa_split' for item in support_type) + has_qa_split = any(item.get("type") == "qa_split" for item in support_type) if result is None: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} The file type is not supported \n", - f"The current file type is: {file_extension}" - ])) + logger.error( + "".join( + [ + 
f"{log_tag_const.MINIO_STORE_PROCESS} The file type is not supported \n", + f"The current file type is: {file_extension}", + ] + ) + ) # 任务失败 - task_status = 'process_fail' + task_status = "process_fail" error_msg = f"{file_extension} 文件类型不支持" break # 新增阶段性日志-clean - clean_stage_detail=_get_stage_detail( + clean_stage_detail = _get_stage_detail( req_json, pool=pool, task_id=id, - document_id=item.get('document_id'), - stage='clean', - file_name=file_name + document_id=item.get("document_id"), + stage="clean", + file_name=file_name, ) - if clean_stage_detail.get('status') == 200: + if clean_stage_detail.get("status") == 200: insert_stage_log_params = { - 'task_id': id, - 'log_id': log_id, - 'file_name': file_name, - 'stage_name': 'clean', - 'stage_status': 'success', - 'stage_detail': clean_stage_detail.get('data'), - 'creator': req_json.get('creator') + "task_id": id, + "log_id": log_id, + "file_name": file_name, + "stage_name": "clean", + "stage_status": "success", + "stage_detail": clean_stage_detail.get("data"), + "creator": req_json.get("creator"), } data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool + insert_stage_log_params, pool=pool ) # 新增阶段性日志-privacy - privacy_stage_detail=_get_stage_detail( + privacy_stage_detail = _get_stage_detail( req_json, pool=pool, task_id=id, - document_id=item.get('document_id'), - stage='privacy', - file_name=file_name + document_id=item.get("document_id"), + stage="privacy", + file_name=file_name, ) - if privacy_stage_detail.get('status') == 200: + if privacy_stage_detail.get("status") == 200: insert_stage_log_params = { - 'task_id': id, - 'log_id': log_id, - 'file_name': file_name, - 'stage_name': 'privacy', - 'stage_status': 'success', - 'stage_detail': privacy_stage_detail.get('data'), - 'creator': req_json.get('creator') + "task_id": id, + "log_id": log_id, + "file_name": file_name, + "stage_name": "privacy", + "stage_status": "success", + "stage_detail": privacy_stage_detail.get("data"), + "creator": req_json.get("creator"), } data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool + insert_stage_log_params, pool=pool ) - - if result.get('status') != 200: + + if result.get("status") != 200: # 任务失败 - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} Data process fail \n", - f"The file name: {file_name}\n", - f"The error is: {result.get('message')}\n" - ])) - task_status = 'process_fail' - error_msg = result.get('message') + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} Data process fail \n", + f"The file name: {file_name}\n", + f"The error is: {result.get('message')}\n", + ] + ) + ) + task_status = "process_fail" + error_msg = result.get("message") # 新增阶段性日志-qa_split if has_qa_split: _get_qa_stage_detail( task_id=id, log_id=log_id, - status='fail', + status="fail", file_name=file_name, - creator=req_json.get('creator'), + creator=req_json.get("creator"), result=result, - pool=pool + pool=pool, ) break - data_volumes_file.append(result['data']) + data_volumes_file.append(result["data"]) # 新增阶段性日志-qa_split if has_qa_split: _get_qa_stage_detail( task_id=id, log_id=log_id, - status='success', + status="success", file_name=file_name, - creator=req_json.get('creator'), + creator=req_json.get("creator"), result=result, - pool=pool + pool=pool, ) # 新增阶段性日志-finish finish_now = date_time_utils.now_str() - finish_stage_detail=f"{finish_now} Task Finished!!!" - + finish_stage_detail = f"{finish_now} Task Finished!!!" 
+ insert_stage_log_params = { - 'task_id': id, - 'log_id': log_id, - 'file_name': file_name, - 'stage_name': 'finish', - 'stage_status': 'success', - 'stage_detail': finish_stage_detail, - 'creator': req_json.get('creator') + "task_id": id, + "log_id": log_id, + "file_name": file_name, + "stage_name": "finish", + "stage_status": "success", + "stage_detail": finish_stage_detail, + "creator": req_json.get("creator"), } - data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool - ) - + data_process_stage_log_db_operate.insert(insert_stage_log_params, pool=pool) + # insert QA list to detail preview - logger.debug(f"{log_tag_const.MINIO_STORE_PROCESS} Insert QA list for detail preview.") - list_qa_params = { - 'task_id': id - } + logger.debug( + f"{log_tag_const.MINIO_STORE_PROCESS} Insert QA list for detail preview." + ) + list_qa_params = {"task_id": id} list_qa_res = data_process_detail_db_operate.top_n_list_qa_for_preview( - list_qa_params, - pool=pool + list_qa_params, pool=pool ) - for item in list_qa_res.get('data'): - item['transform_type']='qa_split' - item['pre_content']=item['question'] - item['post_content']=item['answer'] - data_process_detail_preview_db_operate.insert( - item, - pool=pool - ) + for item in list_qa_res.get("data"): + item["transform_type"] = "qa_split" + item["pre_content"] = item["question"] + item["post_content"] = item["answer"] + data_process_detail_preview_db_operate.insert(item, pool=pool) # 将清洗后的文件上传到MinIO中 # 上传final文件夹下的文件,并添加tag file_path = file_utils.get_temp_file_path() minio_store_client.upload_files_to_minio_with_tags( minio_client=minio_client, - local_folder=file_path + 'final', + local_folder=file_path + "final", minio_bucket=namespace, minio_prefix=folder_prefix, support_type=support_type, - data_volumes_file=data_volumes_file + data_volumes_file=data_volumes_file, ) # update the dataset status update_dataset = _update_dateset_status( - namespace=req_json['namespace'], - version_data_set_name=req_json['version_data_set_name'], + namespace=req_json["namespace"], + version_data_set_name=req_json["version_data_set_name"], reason=task_status, message=error_msg, task_id=id, log_id=log_id, - creator=req_json.get('creator'), - pool=pool + creator=req_json.get("creator"), + pool=pool, ) - if update_dataset['status'] != 200: + if update_dataset["status"] != 200: return update_dataset # 更新数据处理任务日志 update_log_item = { - 'id': log_id, - 'status': task_status, - 'error_msg': error_msg, - 'creator': req_json['creator'] + "id": log_id, + "status": task_status, + "error_msg": error_msg, + "creator": req_json["creator"], } - data_process_log_db_operate.update_status_by_id( - update_log_item, - pool=pool - ) + data_process_log_db_operate.update_status_by_id(update_log_item, pool=pool) # 数据库更新任务状态 update_params = { - 'id': id, - 'current_log_id': log_id, - 'status': task_status, - 'user': req_json['creator'] + "id": id, + "current_log_id": log_id, + "status": task_status, + "user": req_json["creator"], } - data_process_db_operate.update_status_by_id( - update_params, - pool=pool - ) + data_process_db_operate.update_status_by_id(update_params, pool=pool) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} Data process fail \n", - f"{traceback.format_exc()}" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} Data process fail \n", + f"{traceback.format_exc()}", + ] + ) + 
) # 更新数据处理任务日志 update_log_item = { - 'id': log_id, - 'status': 'process_fail', - 'error_msg': '未知错误,请联系管理员!', - 'creator': req_json.get('creator') + "id": log_id, + "status": "process_fail", + "error_msg": "未知错误,请联系管理员!", + "creator": req_json.get("creator"), } - data_process_log_db_operate.update_status_by_id( - update_log_item, - pool=pool - ) + data_process_log_db_operate.update_status_by_id(update_log_item, pool=pool) # 数据库更新任务状态 update_params = { - 'id': id, - 'current_log_id': log_id, - 'status': 'process_fail', - 'user': req_json.get('creator') + "id": id, + "current_log_id": log_id, + "status": "process_fail", + "user": req_json.get("creator"), } - data_process_db_operate.update_status_by_id( - update_params, - pool=pool - ) + data_process_db_operate.update_status_by_id(update_params, pool=pool) # update the dataset status _update_dateset_status( - namespace=req_json.get('namespace'), - version_data_set_name=req_json.get('version_data_set_name'), + namespace=req_json.get("namespace"), + version_data_set_name=req_json.get("version_data_set_name"), reason=task_status, message=error_msg, task_id=id, log_id=log_id, - creator=req_json.get('creator'), - pool=pool + creator=req_json.get("creator"), + pool=pool, ) - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} -def text_manipulate_retry( - req_json, - pool -): - task_id = req_json.get('id') - creator = req_json.get('creator') + +def text_manipulate_retry(req_json, pool): + task_id = req_json.get("id") + creator = req_json.get("creator") log_id = ulid.ulid() # 根据id获取任务信息 - task_info = data_process_db_operate.info_by_id( - req_json, - pool=pool - ) - task_info_dict = task_info.get('data')[0] + task_info = data_process_db_operate.info_by_id(req_json, pool=pool) + task_info_dict = task_info.get("data")[0] try: - # 更新任务状态 update_status_res = _update_status_and_log_id( id=task_id, - current_log_id='', - status='processing', - end_datetime='', + current_log_id="", + status="processing", + end_datetime="", creator=creator, - pool=pool + pool=pool, ) - if update_status_res.get('status') != 200: + if update_status_res.get("status") != 200: return update_status_res - + # 新增数据处理任务日志 - log_info = _insert_log_info( - id=log_id, - task_id=task_id, - execute_type='RETRY', - creator=creator, - pool=pool + _insert_log_info( + id=log_id, task_id=task_id, execute_type="RETRY", creator=creator, pool=pool ) # 更新数据集状态 update_dataset = _update_dateset_status( - namespace=task_info_dict.get('namespace'), - version_data_set_name=task_info_dict.get('pre_version_data_set_name'), - reason='processing', - message='Data processing in progress', + namespace=task_info_dict.get("namespace"), + version_data_set_name=task_info_dict.get("pre_version_data_set_name"), + reason="processing", + message="Data processing in progress", task_id=task_id, log_id=log_id, creator=creator, - pool=pool + pool=pool, ) - if update_dataset['status'] != 200: + if update_dataset["status"] != 200: return update_dataset # 根据task_id查询处理未成功的文件 document_list = data_process_document_db_operate.list_by_task_id_and_status( - req_json, - pool=pool + req_json, pool=pool ) - task_status = 'process_complete' - error_msg = '' - if len(document_list.get('data')) > 0: + task_status = "process_complete" + error_msg = "" + if len(document_list.get("data")) > 0: # 文件处理 # 存放每个文件对应的数据量 data_volumes_file = [] - for document in document_list.get('data'): - logger.debug(''.join([ - 
f"{log_tag_const.MINIO_STORE_PROCESS} document retry \n", - f"file_name: {document.get('file_name')}" - ])) + for document in document_list.get("data"): + logger.debug( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} document retry \n", + f"file_name: {document.get('file_name')}", + ] + ) + ) result = _text_manipulate_retry_for_document( document=document, task_info=task_info_dict, log_id=log_id, creator=creator, - pool=pool + pool=pool, ) - if result.get('status') != 200: + if result.get("status") != 200: # 任务失败 - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} Data process fail \n", - f"The file name: {document.get('file_name')}\n", - f"The error is: {result.get('message')}\n" - ])) - task_status = 'process_fail' - error_msg = result.get('message') + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} Data process fail \n", + f"The file name: {document.get('file_name')}\n", + f"The error is: {result.get('message')}\n", + ] + ) + ) + task_status = "process_fail" + error_msg = result.get("message") break - data_volumes_file.append(result['data']) + data_volumes_file.append(result["data"]) # 新增阶段性日志-finish - finish_stage_detail=f"{date_time_utils.now_str()} Task Finished!!!" + finish_stage_detail = f"{date_time_utils.now_str()} Task Finished!!!" insert_stage_log_params = { - 'task_id': task_id, - 'log_id': log_id, - 'file_name': '', - 'stage_name': 'finish', - 'stage_status': 'success', - 'stage_detail': finish_stage_detail, - 'creator': creator + "task_id": task_id, + "log_id": log_id, + "file_name": "", + "stage_name": "finish", + "stage_status": "success", + "stage_detail": finish_stage_detail, + "creator": creator, } - data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool - ) + data_process_stage_log_db_operate.insert(insert_stage_log_params, pool=pool) # insert QA list to detail preview - logger.debug(f"{log_tag_const.MINIO_STORE_PROCESS} Insert QA list for detail preview.") - list_qa_params = { - 'task_id': task_id - } + logger.debug( + f"{log_tag_const.MINIO_STORE_PROCESS} Insert QA list for detail preview." 
+ ) + list_qa_params = {"task_id": task_id} list_qa_res = data_process_detail_db_operate.top_n_list_qa_for_preview( - list_qa_params, - pool=pool + list_qa_params, pool=pool ) - for item in list_qa_res.get('data'): - item['transform_type']='qa_split' - item['pre_content']=item['question'] - item['post_content']=item['answer'] - data_process_detail_preview_db_operate.insert( - item, - pool=pool - ) + for item in list_qa_res.get("data"): + item["transform_type"] = "qa_split" + item["pre_content"] = item["question"] + item["post_content"] = item["answer"] + data_process_detail_preview_db_operate.insert(item, pool=pool) # 将清洗后的文件上传到MinIO中 # 上传final文件夹下的文件,并添加tag file_path = file_utils.get_temp_file_path() minio_dataset_prefix = config.minio_dataset_prefix - folder_prefix = '/'.join([ - minio_dataset_prefix, - task_info_dict['pre_data_set_name'], - task_info_dict['pre_data_set_version'] - ]) + folder_prefix = "/".join( + [ + minio_dataset_prefix, + task_info_dict["pre_data_set_name"], + task_info_dict["pre_data_set_version"], + ] + ) minio_client = minio_store_client.get_minio_client() minio_store_client.upload_files_to_minio_with_tags( minio_client=minio_client, - local_folder=file_path + 'final', - minio_bucket=task_info_dict.get('namespace'), + local_folder=file_path + "final", + minio_bucket=task_info_dict.get("namespace"), minio_prefix=folder_prefix, - support_type=task_info_dict.get('data_process_config_info'), - data_volumes_file=data_volumes_file + support_type=task_info_dict.get("data_process_config_info"), + data_volumes_file=data_volumes_file, ) # 更新数据集状态 update_dataset = _update_dateset_status( - namespace=task_info_dict.get('namespace'), - version_data_set_name=task_info_dict.get('pre_version_data_set_name'), + namespace=task_info_dict.get("namespace"), + version_data_set_name=task_info_dict.get("pre_version_data_set_name"), reason=task_status, message=error_msg, task_id=task_id, log_id=log_id, creator=creator, - pool=pool + pool=pool, ) - if update_dataset['status'] != 200: + if update_dataset["status"] != 200: return update_dataset # 更新数据处理任务日志 update_log_item = { - 'id': log_id, - 'status': task_status, - 'error_msg': error_msg, - 'creator': creator + "id": log_id, + "status": task_status, + "error_msg": error_msg, + "creator": creator, } - data_process_log_db_operate.update_status_by_id( - update_log_item, - pool=pool - ) + data_process_log_db_operate.update_status_by_id(update_log_item, pool=pool) # 数据库更新任务状态 update_params = { - 'id': task_id, - 'current_log_id': log_id, - 'status': task_status, - 'user': creator + "id": task_id, + "current_log_id": log_id, + "status": task_status, + "user": creator, } - data_process_db_operate.update_status_by_id( - update_params, - pool=pool - ) + data_process_db_operate.update_status_by_id(update_params, pool=pool) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} Data process fail \n", - f"{traceback.format_exc()}" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} Data process fail \n", + f"{traceback.format_exc()}", + ] + ) + ) # 更新数据处理任务日志 update_log_item = { - 'id': log_id, - 'status': 'process_fail', - 'error_msg': '未知错误,请联系管理员!', - 'creator': creator + "id": log_id, + "status": "process_fail", + "error_msg": "未知错误,请联系管理员!", + "creator": creator, } - data_process_log_db_operate.update_status_by_id( - update_log_item, - pool=pool - ) + 
data_process_log_db_operate.update_status_by_id(update_log_item, pool=pool) # 数据库更新任务状态 update_params = { - 'id': task_id, - 'current_log_id': log_id, - 'status': 'process_fail', - 'user': creator + "id": task_id, + "current_log_id": log_id, + "status": "process_fail", + "user": creator, } - data_process_db_operate.update_status_by_id( - update_params, - pool=pool - ) + data_process_db_operate.update_status_by_id(update_params, pool=pool) # 更新数据集状态 update_dataset = _update_dateset_status( - namespace=task_info_dict.get('namespace'), - version_data_set_name=task_info_dict.get('pre_version_data_set_name'), - reason='process_fail', - message='未知错误,请联系管理员!', + namespace=task_info_dict.get("namespace"), + version_data_set_name=task_info_dict.get("pre_version_data_set_name"), + reason="process_fail", + message="未知错误,请联系管理员!", task_id=task_id, log_id=log_id, creator=creator, - pool=pool + pool=pool, ) - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} def _remove_local_file(file_name): try: remove_file_path = file_utils.get_temp_file_path() - local_file_path = remove_file_path + 'original/' + file_name + local_file_path = remove_file_path + "original/" + file_name file_utils.delete_file(local_file_path) - return { - 'status': 200, - 'message': '删除成功', - 'data': '' - } + return {"status": 200, "message": "删除成功", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} remove local file fail \n", - f"the error. \n{traceback.format_exc()}" - ])) - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} remove local file fail \n", + f"the error. 
\n{traceback.format_exc()}", + ] + ) + ) + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} + def _update_dateset_status( - namespace, - version_data_set_name, - reason, - message, - task_id, - log_id, - creator, - pool + namespace, version_data_set_name, reason, message, task_id, log_id, creator, pool ): - logger.debug(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} update dataset status \n", - f"task_id: {task_id}\n", - f"namespace: {namespace}\n", - f"version_data_set_name: {version_data_set_name}\n", - f"reason: {reason}" - ])) + logger.debug( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} update dataset status \n", + f"task_id: {task_id}\n", + f"namespace: {namespace}\n", + f"version_data_set_name: {version_data_set_name}\n", + f"reason: {reason}", + ] + ) + ) update_dataset = dataset_cr.update_dataset_k8s_cr( namespace=namespace, version_data_set_name=version_data_set_name, reason=reason, - message=message + message=message, ) - if update_dataset['status'] != 200: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} update dataset status \n", - f"task_id: {task_id}\n", - f"namespace: {namespace}\n", - f"version_data_set_name: {version_data_set_name}\n", - f"reason: {reason}" - ])) + if update_dataset["status"] != 200: + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} update dataset status \n", + f"task_id: {task_id}\n", + f"namespace: {namespace}\n", + f"version_data_set_name: {version_data_set_name}\n", + f"reason: {reason}", + ] + ) + ) # 更新数据处理任务日志 update_log_item = { - 'id': log_id, - 'status': 'process_fail', - 'error_msg': update_dataset.get('message'), - 'creator': creator + "id": log_id, + "status": "process_fail", + "error_msg": update_dataset.get("message"), + "creator": creator, } - data_process_log_db_operate.update_status_by_id( - update_log_item, - pool=pool - ) + data_process_log_db_operate.update_status_by_id(update_log_item, pool=pool) # 数据库更新任务状态 update_params = { - 'id': task_id, - 'current_log_id': log_id, - 'status': 'process_fail', - 'user': creator + "id": task_id, + "current_log_id": log_id, + "status": "process_fail", + "user": creator, } - data_process_db_operate.update_status_by_id( - update_params, - pool=pool - ) + data_process_db_operate.update_status_by_id(update_params, pool=pool) return update_dataset -def _get_stage_detail( - req_json, - task_id, - document_id, - pool, - stage, - file_name=None -): +def _get_stage_detail(req_json, task_id, document_id, pool, stage, file_name=None): now = date_time_utils.now_str() - stage_detail = '' - operations = req_json.get('data_process_config_info') - - if stage == 'start': - received_task={ - 'task_id': task_id, - 'pre_dataset_name': req_json.get('pre_data_set_name'), - 'pre_dataset_version': req_json.get('pre_data_set_version'), - 'file_names': req_json.get('file_names') + stage_detail = "" + operations = req_json.get("data_process_config_info") + + if stage == "start": + received_task = { + "task_id": task_id, + "pre_dataset_name": req_json.get("pre_data_set_name"), + "pre_dataset_version": req_json.get("pre_data_set_version"), + "file_names": req_json.get("file_names"), } - stage_detail = '\n'.join([ - f"{now} Data Processing Task Starts!!!", - f"Received Task: {json_utils.dumps(received_task)}", - f"Operations: {json_utils.dumps(operations)}" - ]) - elif stage == 'clean': + stage_detail = "\n".join( + [ + f"{now} Data Processing Task Starts!!!", + f"Received Task: {json_utils.dumps(received_task)}", + f"Operations: 
{json_utils.dumps(operations)}", + ] + ) + elif stage == "clean": clean_stage_detail = _get_stage_detail_for_transform_type( task_id=task_id, document_id=document_id, transform_type=operations, - support_type=const.clean_support_type, - pool=pool + support_type=const.CLEAN_SUPPORT_TYPE, + pool=pool, ) - if clean_stage_detail.get('status') != 200: + if clean_stage_detail.get("status") != 200: return clean_stage_detail - stage_detail='\n'.join([ - f"{now} Current Execution Stage: {stage}, File Name: {file_name}", - f"Current Result: {json_utils.dumps(clean_stage_detail.get('data'))}" - ]) - elif stage == 'privacy': + stage_detail = "\n".join( + [ + f"{now} Current Execution Stage: {stage}, File Name: {file_name}", + f"Current Result: {json_utils.dumps(clean_stage_detail.get('data'))}", + ] + ) + elif stage == "privacy": privacy_stage_detail = _get_stage_detail_for_transform_type( task_id=task_id, document_id=document_id, transform_type=operations, - support_type=const.privacy_support_type, - pool=pool + support_type=const.PRIVACY_SUPPORT_TYPE, + pool=pool, ) - if privacy_stage_detail.get('status') != 200: + if privacy_stage_detail.get("status") != 200: return privacy_stage_detail - stage_detail='\n'.join([ - f"{now} Current Execution Stage: {stage}, File Name: {file_name}", - f"Current Result: {json_utils.dumps(privacy_stage_detail.get('data'))}" - ]) - - return { - 'status': 200, - 'message': '', - 'data': stage_detail - } + stage_detail = "\n".join( + [ + f"{now} Current Execution Stage: {stage}, File Name: {file_name}", + f"Current Result: {json_utils.dumps(privacy_stage_detail.get('data'))}", + ] + ) + + return {"status": 200, "message": "", "data": stage_detail} def _get_stage_detail_for_transform_type( - task_id, - document_id, - transform_type, - support_type, - pool + task_id, document_id, transform_type, support_type, pool ): """获取阶段详情日志""" # 处理结果 - operator_result=[] + operator_result = [] stage_support_type = [] for item in transform_type: - if item.get('type') in support_type: - stage_support_type.append(item.get('type')) + if item.get("type") in support_type: + stage_support_type.append(item.get("type")) if len(stage_support_type) == 0: - return { - 'status': 1000, - 'message': '用户没有选择数据异常清洗', - 'data': '' - } + return {"status": 1000, "message": "用户没有选择数据异常清洗", "data": ""} detail_list = _list_for_transform_type( task_id=task_id, document_id=document_id, transform_type=stage_support_type, - pool=pool + pool=pool, ) - if len(detail_list.get('data')) == 0: + if len(detail_list.get("data")) == 0: for item in stage_support_type: - operator_result.append({ - 'type': item, - 'processed_count': 0 - }) + operator_result.append({"type": item, "processed_count": 0}) return { - 'status': 200, - 'message': '', - 'data': { - 'status': 'sucess', - 'operator_count': len(stage_support_type), - 'operator_result': operator_result - } - } + "status": 200, + "message": "", + "data": { + "status": "success", + "operator_count": len(stage_support_type), + "operator_result": operator_result, + }, + } # 判断是否存在状态为fail的数据 - status='success' - has_fail = any(item.get('status') == 'fail' for item in detail_list.get('data')) + status = "success" + has_fail = any(item.get("status") == "fail" for item in detail_list.get("data")) if has_fail: - status='fail' + status = "fail" for item in stage_support_type: list_for_support_type = _list_for_transform_type( - task_id=task_id, - document_id=document_id, - transform_type=[item], - pool=pool + task_id=task_id, document_id=document_id, transform_type=[item], 
pool=pool ) # 判断该类型状态为fail的数据 - data_dict = list_for_support_type.get('data') - has_fail = any(item.get('status') == 'fail' for item in data_dict) + data_dict = list_for_support_type.get("data") + has_fail = any(item.get("status") == "fail" for item in data_dict) if has_fail: - operator_result.append({ - 'type': item, - 'processed_count': len(data_dict), - 'message': data_dict[0].get('error_message') - }) + operator_result.append( + { + "type": item, + "processed_count": len(data_dict), + "message": data_dict[0].get("error_message"), + } + ) else: - operator_result.append({ - 'type': item, - 'processed_count': len(data_dict) - }) - + operator_result.append({"type": item, "processed_count": len(data_dict)}) + current_result = { - 'status': status, - 'operator_count': len(stage_support_type), - 'operator_result': operator_result + "status": status, + "operator_count": len(stage_support_type), + "operator_result": operator_result, } - return { - 'status': 200, - 'message': '', - 'data': current_result - } - -def _get_qa_stage_detail( - task_id, - log_id, - status, - file_name, - creator, - result, - pool -): + return {"status": 200, "message": "", "data": current_result} + + +def _get_qa_stage_detail(task_id, log_id, status, file_name, creator, result, pool): """获取QA阶段详情日志""" now = date_time_utils.now_str() current_result = None - if status == 'fail': + if status == "fail": current_result = { - 'status': status, - 'qa_count': 0, - 'message': result.get('message') + "status": status, + "qa_count": 0, + "message": result.get("message"), } else: current_result = { - 'status': status, - 'qa_count': result.get('data').get('object_count') + "status": status, + "qa_count": result.get("data").get("object_count"), } - qa_stage_detail='\n'.join([ - f"{now} Current Execution Stage: qa_split, File Name: {file_name}", - f"Current Result: {json_utils.dumps(current_result)}" - ]) - + qa_stage_detail = "\n".join( + [ + f"{now} Current Execution Stage: qa_split, File Name: {file_name}", + f"Current Result: {json_utils.dumps(current_result)}", + ] + ) + insert_stage_log_params = { - 'task_id': task_id, - 'log_id': log_id, - 'file_name': file_name, - 'stage_name': 'qa_split', - 'stage_status': status, - 'stage_detail': qa_stage_detail, - 'creator': creator + "task_id": task_id, + "log_id": log_id, + "file_name": file_name, + "stage_name": "qa_split", + "stage_status": status, + "stage_detail": qa_stage_detail, + "creator": creator, } - data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool - ) + data_process_stage_log_db_operate.insert(insert_stage_log_params, pool=pool) - return { - 'status': 200, - 'message': '', - 'data': current_result - } + return {"status": 200, "message": "", "data": current_result} -def _list_for_transform_type( - task_id, - document_id, - transform_type, - pool -): - params={ - 'task_id': task_id, - 'document_id': document_id, - 'transform_type': transform_type +def _list_for_transform_type(task_id, document_id, transform_type, pool): + params = { + "task_id": task_id, + "document_id": document_id, + "transform_type": transform_type, } - return data_process_detail_db_operate.list_for_transform_type( - params, - pool=pool - ) + return data_process_detail_db_operate.list_for_transform_type(params, pool=pool) -def _update_status_and_log_id( - id, - current_log_id, - status, - end_datetime, - creator, - pool -): +# """update task status and current log id with task id""" +def _update_status_and_log_id(id, current_log_id, status, end_datetime, creator, pool): try: 
- """update task status and current log id with task id""" - logger.debug(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} update task status \n", - f"task_id: {id}\n", - f"status: {status}\n" - ])) + logger.debug( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} update task status \n", + f"task_id: {id}\n", + f"status: {status}\n", + ] + ) + ) update_task_params = { - 'id': id, - 'current_log_id': current_log_id, - 'status': status, - 'end_datetime': end_datetime, - 'user': creator + "id": id, + "current_log_id": current_log_id, + "status": status, + "end_datetime": end_datetime, + "user": creator, } - data_process_db_operate.update_status_and_log_id( - update_task_params, - pool=pool - ) + data_process_db_operate.update_status_and_log_id(update_task_params, pool=pool) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} update task fail \n", - f"{traceback.format_exc()}" - ])) - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} update task fail \n", + f"{traceback.format_exc()}", + ] + ) + ) + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} -def _insert_log_info( - id, - task_id, - execute_type, - creator, - pool -): +# """insert task log info""" +def _insert_log_info(id, task_id, execute_type, creator, pool): try: - """insert task log info""" - logger.debug(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} insert task log \n", - f"task_id: {task_id}\n", - f"execute_type: {execute_type}\n" - ])) + logger.debug( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} insert task log \n", + f"task_id: {task_id}\n", + f"execute_type: {execute_type}\n", + ] + ) + ) insert_log_item = { - 'id': id, - 'task_id': task_id, - 'type': execute_type, - 'creator': creator + "id": id, + "task_id": task_id, + "type": execute_type, + "creator": creator, } - data_process_log_db_operate.add( - insert_log_item, - pool=pool - ) + data_process_log_db_operate.add(insert_log_item, pool=pool) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} insert task log info \n", - f"{traceback.format_exc()}" - ])) - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} insert task log info \n", + f"{traceback.format_exc()}", + ] + ) + ) + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} -def _text_manipulate_retry_for_document( - document, - task_info, - log_id, - pool, - creator -): - file_name = document.get('file_name') - task_id = task_info.get('id') - document_id = document.get('id') - support_type = task_info.get('data_process_config_info') - +def _text_manipulate_retry_for_document(document, task_info, log_id, pool, creator): + file_name = document.get("file_name") + task_id = task_info.get("id") + document_id = document.get("id") + support_type = task_info.get("data_process_config_info") # 新增阶段性日志-开始 - received_task={ - 'task_id': task_id, - 'pre_dataset_name': document.get('pre_data_set_name'), - 'pre_dataset_version': document.get('pre_data_set_version'), - 'file_names': document.get('file_names') + received_task = { + "task_id": task_id, + 
"pre_dataset_name": document.get("pre_data_set_name"), + "pre_dataset_version": document.get("pre_data_set_version"), + "file_names": document.get("file_names"), } - start_stage_detail = '\n'.join([ - f"{date_time_utils.now_str()} Data Processing Task Retry Starts!!!", - f"Received Task: {json_utils.dumps(received_task)}", - f"Operations: {json_utils.dumps(support_type)}" - ]) + start_stage_detail = "\n".join( + [ + f"{date_time_utils.now_str()} Data Processing Task Retry Starts!!!", + f"Received Task: {json_utils.dumps(received_task)}", + f"Operations: {json_utils.dumps(support_type)}", + ] + ) insert_stage_log_params = { - 'task_id': task_id, - 'log_id': log_id, - 'file_name': file_name, - 'stage_name': 'start', - 'stage_status': 'success', - 'stage_detail': start_stage_detail, - 'creator': creator + "task_id": task_id, + "log_id": log_id, + "file_name": file_name, + "stage_name": "start", + "stage_status": "success", + "stage_detail": start_stage_detail, + "creator": creator, } - data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool + data_process_stage_log_db_operate.insert(insert_stage_log_params, pool=pool) + + logger.debug( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} text manipulate retry \n", + f"document status: {document.get('status')}", + ] + ) ) - - logger.debug(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} text manipulate retry \n", - f"document status: {document.get('status')}" - ])) result = None # 判断文件状态 - if document.get('status') == 'not_start': + if document.get("status") == "not_start": # 针对未开始的文件进行重试 # minio 数据集统一前缀 minio_dataset_prefix = config.minio_dataset_prefix - folder_prefix = '/'.join([ - minio_dataset_prefix, - task_info.get('pre_data_set_name'), - task_info.get('pre_data_set_version') - ]) + folder_prefix = "/".join( + [ + minio_dataset_prefix, + task_info.get("pre_data_set_name"), + task_info.get("pre_data_set_version"), + ] + ) # get a minio client minio_client = minio_store_client.get_minio_client() # 将文件下载到本地 minio_store_client.download( minio_client, - bucket_name=task_info.get('namespace'), + bucket_name=task_info.get("namespace"), folder_prefix=folder_prefix, - file_name=file_name + file_name=file_name, ) - document_type = document.get('document_type') - if document_type in ['pdf']: + document_type = document.get("document_type") + if document_type in ["pdf"]: # 处理PDF文件 result = pdf_handle.text_manipulate( file_name=file_name, - document_id=document.get('id'), + document_id=document.get("id"), support_type=support_type, conn_pool=pool, task_id=task_id, - create_user=creator + create_user=creator, ) - - elif document_type in ['docx']: + + elif document_type in ["docx"]: # 处理.docx文件 result = word_handle.docx_text_manipulate( file_name=file_name, - document_id=document.get('id'), + document_id=document.get("id"), support_type=support_type, conn_pool=pool, task_id=task_id, - create_user=creator + create_user=creator, ) - + # 将下载的本地文件删除 _remove_local_file(file_name) @@ -1141,108 +1013,98 @@ def _text_manipulate_retry_for_document( # 针对进行中和失败的文件进行重试 # 获取未成功的chunk列表 - query_chunk_params = { - 'document_id': document.get('id') - } + query_chunk_params = {"document_id": document.get("id")} document_chunk_dict = data_process_document_chunk_db_operate.list_by_status( - query_chunk_params, - pool=pool + query_chunk_params, pool=pool ) - if len(document_chunk_dict.get('data')) > 0: + if len(document_chunk_dict.get("data")) > 0: result = common_handle.text_manipulate( file_name=file_name, - 
all_document_for_process=document_chunk_dict.get('data'), + all_document_for_process=document_chunk_dict.get("data"), support_type=support_type, conn_pool=pool, - create_user=creator + create_user=creator, ) # 判断是否存在qa拆分 - has_qa_split = any(item.get('type') == 'qa_split' for item in support_type) + has_qa_split = any(item.get("type") == "qa_split" for item in support_type) if result is None: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} The file type is not supported \n", - f"The current file type is: {document_type}" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} The file type is not supported \n", + f"The current file type is: {document_type}", + ] + ) + ) # 任务失败 error_msg = f"{document_type} 文件类型不支持" - return { - 'status': 400, - 'message': error_msg, - 'data': '' - } - + return {"status": 400, "message": error_msg, "data": ""} + # 新增阶段性日志-clean - clean_stage_detail=_get_stage_detail( + clean_stage_detail = _get_stage_detail( task_info, pool=pool, task_id=task_id, document_id=document_id, - stage='clean', - file_name=file_name + stage="clean", + file_name=file_name, ) - if clean_stage_detail.get('status') == 200: + if clean_stage_detail.get("status") == 200: insert_stage_log_params = { - 'task_id': task_id, - 'log_id': log_id, - 'file_name': file_name, - 'stage_name': 'clean', - 'stage_status': 'success', - 'stage_detail': clean_stage_detail.get('data'), - 'creator': creator + "task_id": task_id, + "log_id": log_id, + "file_name": file_name, + "stage_name": "clean", + "stage_status": "success", + "stage_detail": clean_stage_detail.get("data"), + "creator": creator, } - data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool - ) + data_process_stage_log_db_operate.insert(insert_stage_log_params, pool=pool) # 新增阶段性日志-privacy - privacy_stage_detail=_get_stage_detail( + privacy_stage_detail = _get_stage_detail( task_info, pool=pool, task_id=task_id, document_id=document_id, - stage='privacy', - file_name=file_name + stage="privacy", + file_name=file_name, ) - if privacy_stage_detail.get('status') == 200: + if privacy_stage_detail.get("status") == 200: insert_stage_log_params = { - 'task_id': task_id, - 'log_id': log_id, - 'file_name': file_name, - 'stage_name': 'privacy', - 'stage_status': 'success', - 'stage_detail': privacy_stage_detail.get('data'), - 'creator': creator + "task_id": task_id, + "log_id": log_id, + "file_name": file_name, + "stage_name": "privacy", + "stage_status": "success", + "stage_detail": privacy_stage_detail.get("data"), + "creator": creator, } - data_process_stage_log_db_operate.insert( - insert_stage_log_params, - pool=pool - ) - + data_process_stage_log_db_operate.insert(insert_stage_log_params, pool=pool) + # 新增阶段性日志-qa_split if has_qa_split: - if result.get('status') != 200: + if result.get("status") != 200: _get_qa_stage_detail( task_id=task_id, log_id=log_id, - status='fail', + status="fail", file_name=file_name, creator=creator, result=result, - pool=pool + pool=pool, ) else: _get_qa_stage_detail( task_id=task_id, log_id=log_id, - status='success', + status="success", file_name=file_name, creator=creator, result=result, - pool=pool + pool=pool, ) return result - diff --git a/pypi/data-processing/src/database_clients/postgresql_pool_client.py b/pypi/data-processing/src/database_clients/postgresql_pool_client.py index abffa855d..d3f36d90f 100644 --- a/pypi/data-processing/src/database_clients/postgresql_pool_client.py +++ 
b/pypi/data-processing/src/database_clients/postgresql_pool_client.py @@ -16,9 +16,8 @@ import traceback import psycopg2.extras -from dbutils.pooled_db import PooledDB - from common import log_tag_const +from dbutils.pooled_db import PooledDB logger = logging.getLogger(__name__) @@ -32,7 +31,7 @@ def get_pool(connection_creator): maxcached=8, maxshared=8, maxconnections=8, - blocking=True + blocking=True, ) @@ -40,10 +39,13 @@ def release_pool(pool): """Release the database connection pool.""" if pool is not None: pool.close() - logger.debug(f"{log_tag_const.DATABASE_POSTGRESQL} Release the database connection pool.") + logger.debug( + f"{log_tag_const.DATABASE_POSTGRESQL} Release the database connection pool." + ) else: - logger.debug(f"{log_tag_const.DATABASE_POSTGRESQL} The database connection pool is None.") - + logger.debug( + f"{log_tag_const.DATABASE_POSTGRESQL} The database connection pool is None." + ) def get_connection_from_pool(pool): @@ -52,10 +54,9 @@ def get_connection_from_pool(pool): return pool.connection() - def execute_query(pool, sql, params={}): """Execute a query with the parameters.""" - error = '' + error = "" data = [] try: with pool.connection() as conn: @@ -72,29 +73,25 @@ def execute_query(pool, sql, params={}): except Exception as ex: error = str(ex) data = None - logger.error(''.join([ - f"{log_tag_const.DATABASE_POSTGRESQL} Executing the sql failed\n {sql} \n", - f"The error is: \n{error}\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) - + logger.error( + "".join( + [ + f"{log_tag_const.DATABASE_POSTGRESQL} Executing the sql failed\n {sql} \n", + f"The error is: \n{error}\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + if len(error) > 0: - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } - - return { - 'status': 200, - 'message': '', - "data": data - } - - + return {"status": 400, "message": error, "data": traceback.format_exc()} + + return {"status": 200, "message": "", "data": data} + + def execute_count_query(pool, sql, params={}): """Execute a count query with the parameters.""" - error = '' + error = "" data = None try: with pool.connection() as conn: @@ -104,29 +101,25 @@ def execute_count_query(pool, sql, params={}): except Exception as ex: error = str(ex) data = None - logger.error(''.join([ - f"{log_tag_const.DATABASE_POSTGRESQL} Executing the count sql failed\n {sql} \n", - f"\nThe error is: \n{error}\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) - + logger.error( + "".join( + [ + f"{log_tag_const.DATABASE_POSTGRESQL} Executing the count sql failed\n {sql} \n", + f"\nThe error is: \n{error}\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + if len(error) > 0: - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } - - return { - 'status': 200, - 'message': '', - "data": data - } + return {"status": 400, "message": error, "data": traceback.format_exc()} + + return {"status": 200, "message": "", "data": data} def execute_update(pool, sql, params={}): """Execute a update with the parameters.""" - error = '' + error = "" data = None try: with pool.connection() as conn: @@ -137,23 +130,17 @@ def execute_update(pool, sql, params={}): error = str(ex) data = None conn.rollback() - logger.error(''.join([ - f"{log_tag_const.DATABASE_POSTGRESQL} Executing the update sql failed\n {sql} \n", - f"\nThe error is: \n{error}\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) - - if len(error) > 0: - 
return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } - - return { - 'status': 200, - 'message': '处理成功', - "data": data - } + logger.error( + "".join( + [ + f"{log_tag_const.DATABASE_POSTGRESQL} Executing the update sql failed\n {sql} \n", + f"\nThe error is: \n{error}\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + if len(error) > 0: + return {"status": 400, "message": error, "data": traceback.format_exc()} + return {"status": 200, "message": "处理成功", "data": data} diff --git a/pypi/data-processing/src/database_operate/data_process_db_operate.py b/pypi/data-processing/src/database_operate/data_process_db_operate.py index 2621f2ab7..a9a2346b0 100644 --- a/pypi/data-processing/src/database_operate/data_process_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_db_operate.py @@ -14,23 +14,17 @@ import ujson -import ulid -from sanic.response import json - from database_clients import postgresql_pool_client from utils import date_time_utils -def list_by_page( - req_json, - pool -): +def list_by_page(req_json, pool): """Get the list data for data processing by page""" params = { - 'keyword': '%' + req_json['keyword'] + '%', - 'namespace': req_json['namespace'], - 'pageIndex': int(req_json['pageIndex']), - 'pageSize': int(req_json['pageSize']) + "keyword": "%" + req_json["keyword"] + "%", + "namespace": req_json["namespace"], + "pageIndex": int(req_json["pageIndex"]), + "pageSize": int(req_json["pageSize"]), } sql = """ @@ -62,14 +56,11 @@ def list_by_page( return res -def list_by_count( - req_json, - pool -): +def list_by_count(req_json, pool): """Get count for the list data processing with page""" params = { - 'keyword': '%' + req_json['keyword'] + '%', - 'namespace': req_json['namespace'] + "keyword": "%" + req_json["keyword"] + "%", + "namespace": req_json["namespace"], } sql = """ @@ -86,14 +77,9 @@ def list_by_count( return res -def delete_by_id( - req_json, - pool -): +def delete_by_id(req_json, pool): """Delete a record with id""" - params = { - 'id': req_json['id'] - } + params = {"id": req_json["id"]} sql = """ delete from public.data_process_task @@ -105,36 +91,32 @@ def delete_by_id( return res -def add( - req_json, - pool, - id -): +def add(req_json, pool, id): """Add a new record""" now = date_time_utils.now_str() - user = req_json['creator'] - program = '数据处理任务-新增' + user = req_json["creator"] + program = "数据处理任务-新增" params = { - 'id': id, - 'name': req_json['name'], - 'file_type': req_json['file_type'], - 'status': 'processing', - 'namespace': req_json['namespace'], - 'pre_data_set_name': req_json['pre_data_set_name'], - 'pre_data_set_version': req_json['pre_data_set_version'], - 'pre_version_data_set_name': req_json['version_data_set_name'], - 'file_names': ujson.dumps(req_json['file_names']), - 'post_data_set_name': req_json['post_data_set_name'], - 'post_data_set_version': req_json['post_data_set_version'], - 'data_process_config_info': ujson.dumps(req_json['data_process_config_info']), - 'start_datetime': now, - 'create_datetime': now, - 'create_user': user, - 'create_program': program, - 'update_datetime': now, - 'update_user': user, - 'update_program': program + "id": id, + "name": req_json["name"], + "file_type": req_json["file_type"], + "status": "processing", + "namespace": req_json["namespace"], + "pre_data_set_name": req_json["pre_data_set_name"], + "pre_data_set_version": req_json["pre_data_set_version"], + "pre_version_data_set_name": req_json["version_data_set_name"], + "file_names": 
ujson.dumps(req_json["file_names"]), + "post_data_set_name": req_json["post_data_set_name"], + "post_data_set_version": req_json["post_data_set_version"], + "data_process_config_info": ujson.dumps(req_json["data_process_config_info"]), + "start_datetime": now, + "create_datetime": now, + "create_user": user, + "create_program": program, + "update_datetime": now, + "update_user": user, + "update_program": program, } sql = """ @@ -186,23 +168,20 @@ def add( return res -def update_status_by_id( - req_json, - pool -): +def update_status_by_id(req_json, pool): """Update the status with id""" now = date_time_utils.now_str() - user = req_json['user'] - program = '修改任务状态' + user = req_json["user"] + program = "修改任务状态" params = { - 'id': req_json['id'], - 'status': req_json['status'], - 'current_log_id': req_json['current_log_id'], - 'end_datetime': now, - 'update_datetime': now, - 'update_program': program, - 'update_user': user + "id": req_json["id"], + "status": req_json["status"], + "current_log_id": req_json["current_log_id"], + "end_datetime": now, + "update_datetime": now, + "update_program": program, + "update_user": user, } sql = """ @@ -221,14 +200,9 @@ def update_status_by_id( return res -def info_by_id( - req_json, - pool -): +def info_by_id(req_json, pool): """info with id""" - params = { - 'id': req_json['id'] - } + params = {"id": req_json["id"]} sql = """ select @@ -262,15 +236,10 @@ def info_by_id( res = postgresql_pool_client.execute_query(pool, sql, params) return res -def count_by_name( - req_json, - pool -): + +def count_by_name(req_json, pool): """Check for duplicate names.""" - params = { - 'name': req_json['name'], - 'namespace': req_json['namespace'] - } + params = {"name": req_json["name"], "namespace": req_json["namespace"]} sql = """ select @@ -286,22 +255,19 @@ def count_by_name( return res -def update_status_and_log_id( - req_json, - pool -): +def update_status_and_log_id(req_json, pool): """Update the status and current log id with task id""" - user = req_json['user'] - program = '修改任务状态' + user = req_json["user"] + program = "修改任务状态" params = { - 'id': req_json.get('id'), - 'status': req_json.get('status'), - 'current_log_id': req_json.get('current_log_id'), - 'end_datetime': req_json.get('end_datetime'), - 'update_datetime': req_json.get('end_datetime'), - 'update_program': program, - 'update_user': user + "id": req_json.get("id"), + "status": req_json.get("status"), + "current_log_id": req_json.get("current_log_id"), + "end_datetime": req_json.get("end_datetime"), + "update_datetime": req_json.get("end_datetime"), + "update_program": program, + "update_user": user, } sql = """ @@ -318,4 +284,3 @@ def update_status_and_log_id( res = postgresql_pool_client.execute_update(pool, sql, params) return res - diff --git a/pypi/data-processing/src/database_operate/data_process_detail_db_operate.py b/pypi/data-processing/src/database_operate/data_process_detail_db_operate.py index 631167634..c40eddf00 100644 --- a/pypi/data-processing/src/database_operate/data_process_detail_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_detail_db_operate.py @@ -17,32 +17,29 @@ from utils import date_time_utils -def insert_transform_info( - req_json, - pool -): +def insert_transform_info(req_json, pool): """Insert a transform info""" now = date_time_utils.now_str() - user = req_json['create_user'] - program = '数据处理任务详情-新增' + user = req_json["create_user"] + program = "数据处理任务详情-新增" params = { - 'id': req_json.get('id'), - 'task_id': req_json.get('task_id'), - 
'document_id': req_json.get('document_id'), - 'document_chunk_id': req_json.get('document_chunk_id'), - 'file_name': req_json.get('file_name'), - 'transform_type': req_json.get('transform_type'), - 'pre_content': req_json.get('pre_content'), - 'post_content': req_json.get('post_content'), - 'status': req_json.get('status'), - 'error_message': req_json.get('error_message'), - 'create_datetime': now, - 'create_user': user, - 'create_program': program, - 'update_datetime': now, - 'update_user': user, - 'update_program': program + "id": req_json.get("id"), + "task_id": req_json.get("task_id"), + "document_id": req_json.get("document_id"), + "document_chunk_id": req_json.get("document_chunk_id"), + "file_name": req_json.get("file_name"), + "transform_type": req_json.get("transform_type"), + "pre_content": req_json.get("pre_content"), + "post_content": req_json.get("post_content"), + "status": req_json.get("status"), + "error_message": req_json.get("error_message"), + "create_datetime": now, + "create_user": user, + "create_program": program, + "update_datetime": now, + "update_user": user, + "update_program": program, } sql = """ @@ -88,29 +85,26 @@ def insert_transform_info( return res -def insert_question_answer_info( - req_json, - pool -): +def insert_question_answer_info(req_json, pool): """Insert a question answer info""" now = date_time_utils.now_str() - user = req_json['create_user'] - program = '数据处理任务问题和答案-新增' + user = req_json["create_user"] + program = "数据处理任务问题和答案-新增" params = { - 'id': req_json['id'], - 'task_id': req_json['task_id'], - 'document_id': req_json['document_id'], - 'document_chunk_id': req_json['document_chunk_id'], - 'file_name': req_json['file_name'], - 'question': req_json['question'], - 'answer': req_json['answer'], - 'create_datetime': now, - 'create_user': user, - 'create_program': program, - 'update_datetime': now, - 'update_user': user, - 'update_program': program + "id": req_json["id"], + "task_id": req_json["task_id"], + "document_id": req_json["document_id"], + "document_chunk_id": req_json["document_chunk_id"], + "file_name": req_json["file_name"], + "question": req_json["question"], + "answer": req_json["answer"], + "create_datetime": now, + "create_user": user, + "create_program": program, + "update_datetime": now, + "update_user": user, + "update_program": program, } sql = """ @@ -150,22 +144,19 @@ def insert_question_answer_info( return res -def list_file_name_for_transform( - req_json, - pool -): +def list_file_name_for_transform(req_json, pool): """List file name for transform in the task detail. - - req_json is a dictionary object. for example: - { - "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS", - "transform_type": "remove_invisible_characters" - } - pool: databasec connection pool; + + req_json is a dictionary object. for example: + { + "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS", + "transform_type": "remove_invisible_characters" + } + pool: databasec connection pool; """ params = { - 'task_id': req_json['task_id'], - 'transform_type': req_json['transform_type'], + "task_id": req_json["task_id"], + "transform_type": req_json["transform_type"], } sql = """ @@ -182,13 +173,10 @@ def list_file_name_for_transform( return res -def top_n_list_transform_for_preview( - req_json, - pool -): - """List transform info with task id, file name and +def top_n_list_transform_for_preview(req_json, pool): + """List transform info with task id, file name and transform type for preview. - + req_json is a dictionary object. 
for example: { "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS", @@ -198,9 +186,9 @@ def top_n_list_transform_for_preview( pool: databasec connection pool; """ params = { - 'task_id': req_json['task_id'], - 'file_name': req_json['file_name'], - 'transform_type': req_json['transform_type'] + "task_id": req_json["task_id"], + "file_name": req_json["file_name"], + "transform_type": req_json["transform_type"], } sql = """ @@ -224,23 +212,18 @@ def top_n_list_transform_for_preview( res = postgresql_pool_client.execute_query(pool, sql, params) return res - -def list_file_name_in_qa_by_task_id( - req_json, - pool -): + +def list_file_name_in_qa_by_task_id(req_json, pool): """List file name in question answer with task id. - + req_json is a dictionary object. for example: { "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['task_id'] - } + params = {"task_id": req_json["task_id"]} sql = """ select @@ -255,12 +238,9 @@ def list_file_name_in_qa_by_task_id( return res -def top_n_list_qa_for_preview( - req_json, - pool -): +def top_n_list_qa_for_preview(req_json, pool): """List question answer info with task id for preview. - + req_json is a dictionary object. for example: { "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS", @@ -268,9 +248,7 @@ def top_n_list_qa_for_preview( } pool: databasec connection pool; """ - params = { - 'task_id': req_json['task_id'] - } + params = {"task_id": req_json["task_id"]} sql = """ select @@ -296,21 +274,17 @@ def top_n_list_qa_for_preview( res = postgresql_pool_client.execute_query(pool, sql, params) return res -def delete_transform_by_task_id( - req_json, - pool -): + +def delete_transform_by_task_id(req_json, pool): """delete transform info by task id. - + req_json is a dictionary object. for example: { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['id'] - } + params = {"task_id": req_json["id"]} sql = """ delete from public.data_process_task_detail @@ -321,21 +295,17 @@ def delete_transform_by_task_id( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def delete_qa_by_task_id( - req_json, - pool -): + +def delete_qa_by_task_id(req_json, pool): """delete qa info by task id. - + req_json is a dictionary object. for example: { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['id'] - } + params = {"task_id": req_json["id"]} sql = """ delete from public.data_process_task_question_answer @@ -347,21 +317,16 @@ def delete_qa_by_task_id( return res -def list_file_name_for_clean( - req_json, - pool -): +def list_file_name_for_clean(req_json, pool): """List file name for clean in the task detail. - - req_json is a dictionary object. for example: - { - "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS" - } - pool: databasec connection pool; - """ - params = { - 'task_id': req_json['task_id'] + + req_json is a dictionary object. 
for example: + { + "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS" } + pool: databasec connection pool; + """ + params = {"task_id": req_json["task_id"]} sql = """ select @@ -377,29 +342,26 @@ def list_file_name_for_clean( return res -def insert_question_answer_clean_info( - req_json, - pool -): +def insert_question_answer_clean_info(req_json, pool): """Insert a question answer clean info""" now = date_time_utils.now_str() - user = req_json['create_user'] - program = '数据处理任务问题和答案-新增' + user = req_json["create_user"] + program = "数据处理任务问题和答案-新增" params = { - 'id': req_json['id'], - 'task_id': req_json['task_id'], - 'document_id': req_json['document_id'], - 'document_chunk_id': req_json['document_chunk_id'], - 'file_name': req_json['file_name'], - 'question': req_json['question'], - 'answer': req_json['answer'], - 'create_datetime': now, - 'create_user': user, - 'create_program': program, - 'update_datetime': now, - 'update_user': user, - 'update_program': program + "id": req_json["id"], + "task_id": req_json["task_id"], + "document_id": req_json["document_id"], + "document_chunk_id": req_json["document_chunk_id"], + "file_name": req_json["file_name"], + "question": req_json["question"], + "answer": req_json["answer"], + "create_datetime": now, + "create_user": user, + "create_program": program, + "update_datetime": now, + "update_user": user, + "update_program": program, } sql = """ @@ -439,21 +401,16 @@ def insert_question_answer_clean_info( return res -def query_question_answer_list( - document_id, - pool -): +def query_question_answer_list(document_id, pool): """List question answer with document id. - + req_json is a dictionary object. for example: { "document_id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'document_id': document_id - } + params = {"document_id": document_id} sql = """ select @@ -477,21 +434,17 @@ def query_question_answer_list( res = postgresql_pool_client.execute_query(pool, sql, params) return res -def list_file_name_for_privacy( - req_json, - pool -): + +def list_file_name_for_privacy(req_json, pool): """List file name for privacy in the task detail. - - req_json is a dictionary object. for example: - { - "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS" - } - pool: databasec connection pool; - """ - params = { - 'task_id': req_json['task_id'] + + req_json is a dictionary object. for example: + { + "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS" } + pool: databasec connection pool; + """ + params = {"task_id": req_json["task_id"]} sql = """ select @@ -506,21 +459,17 @@ def list_file_name_for_privacy( res = postgresql_pool_client.execute_query(pool, sql, params) return res -def delete_qa_clean_by_task_id( - req_json, - pool -): + +def delete_qa_clean_by_task_id(req_json, pool): """delete qa clean info by task id. - + req_json is a dictionary object. 
for example: { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['id'] - } + params = {"task_id": req_json["id"]} sql = """ delete from public.data_process_task_question_answer_clean @@ -532,15 +481,12 @@ def delete_qa_clean_by_task_id( return res -def list_for_transform_type( - req_json, - pool -): +def list_for_transform_type(req_json, pool): """List transform for clean in the task detail.""" params = { - 'task_id': req_json.get('task_id'), - 'document_id': req_json.get('document_id'), - 'transform_type': tuple(req_json.get('transform_type')) + "task_id": req_json.get("task_id"), + "document_id": req_json.get("document_id"), + "transform_type": tuple(req_json.get("transform_type")), } sql = """ @@ -565,12 +511,9 @@ def list_for_transform_type( return res -def delete_transform_by_document_chunk( - req_json, - pool -): +def delete_transform_by_document_chunk(req_json, pool): """delete transform by task id and document id and chunk id. - + req_json is a dictionary object. for example: { "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS", @@ -580,9 +523,9 @@ def delete_transform_by_document_chunk( pool: databasec connection pool; """ params = { - 'task_id': req_json.get('task_id'), - 'document_id': req_json.get('document_id'), - 'document_chunk_id': req_json.get('document_chunk_id') + "task_id": req_json.get("task_id"), + "document_id": req_json.get("document_id"), + "document_chunk_id": req_json.get("document_chunk_id"), } sql = """ @@ -595,4 +538,3 @@ def delete_transform_by_document_chunk( res = postgresql_pool_client.execute_update(pool, sql, params) return res - diff --git a/pypi/data-processing/src/database_operate/data_process_detail_preview_db_operate.py b/pypi/data-processing/src/database_operate/data_process_detail_preview_db_operate.py index 0fccc402f..5ba868766 100644 --- a/pypi/data-processing/src/database_operate/data_process_detail_preview_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_detail_preview_db_operate.py @@ -14,28 +14,24 @@ from database_clients import postgresql_pool_client -from utils import date_time_utils -def insert( - req_json, - pool -): +def insert(req_json, pool): """Insert info""" params = { - 'id': req_json['id'], - 'task_id': req_json['task_id'], - 'file_name': req_json['file_name'], - 'transform_type': req_json['transform_type'], - 'pre_content': req_json['pre_content'], - 'post_content': req_json['post_content'], - 'create_datetime': req_json['create_datetime'], - 'create_user': req_json['create_user'], - 'create_program': req_json['create_program'], - 'update_datetime': req_json['update_datetime'], - 'update_user': req_json['update_user'], - 'update_program': req_json['update_program'] + "id": req_json["id"], + "task_id": req_json["task_id"], + "file_name": req_json["file_name"], + "transform_type": req_json["transform_type"], + "pre_content": req_json["pre_content"], + "post_content": req_json["post_content"], + "create_datetime": req_json["create_datetime"], + "create_user": req_json["create_user"], + "create_program": req_json["create_program"], + "update_datetime": req_json["update_datetime"], + "update_user": req_json["update_user"], + "update_program": req_json["update_program"], } sql = """ @@ -72,12 +68,10 @@ def insert( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def list_file_name_by_task_id( - req_json, - pool -): + +def list_file_name_by_task_id(req_json, pool): """List file name with task id and transform_type. 
- + req_json is a dictionary object. for example: { "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS", @@ -86,8 +80,8 @@ def list_file_name_by_task_id( pool: databasec connection pool; """ params = { - 'task_id': req_json['task_id'], - 'transform_type': req_json['transform_type'] + "task_id": req_json["task_id"], + "transform_type": req_json["transform_type"], } sql = """ @@ -103,12 +97,10 @@ def list_file_name_by_task_id( res = postgresql_pool_client.execute_query(pool, sql, params) return res -def list_for_preview( - req_json, - pool -): + +def list_for_preview(req_json, pool): """List file name with task id and transform_type. - + req_json is a dictionary object. for example: { "task_id": "01HGWBE48DT3ADE9ZKA62SW4WS", @@ -117,8 +109,8 @@ def list_for_preview( pool: databasec connection pool; """ params = { - 'task_id': req_json['task_id'], - 'transform_type': req_json['transform_type'] + "task_id": req_json["task_id"], + "transform_type": req_json["transform_type"], } sql = """ @@ -139,21 +131,16 @@ def list_for_preview( return res -def delete_qa_by_task_id( - req_json, - pool -): +def delete_qa_by_task_id(req_json, pool): """delete qa info by task id. - + req_json is a dictionary object. for example: { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['id'] - } + params = {"task_id": req_json["id"]} sql = """ delete from public.data_process_task_detail_preview diff --git a/pypi/data-processing/src/database_operate/data_process_document_chunk_db_operate.py b/pypi/data-processing/src/database_operate/data_process_document_chunk_db_operate.py index 509b32737..655f42baa 100644 --- a/pypi/data-processing/src/database_operate/data_process_document_chunk_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_document_chunk_db_operate.py @@ -12,35 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import ulid - from database_clients import postgresql_pool_client from utils import date_time_utils -def add( - req_json, - pool -): +def add(req_json, pool): """Add a new record""" now = date_time_utils.now_str() - user = req_json['creator'] - program = '数据处理文件拆分-新增' + user = req_json["creator"] + program = "数据处理文件拆分-新增" params = { - 'id': req_json.get('id'), - 'document_id': req_json.get('document_id'), - 'status': req_json.get('status'), - 'task_id': req_json.get('task_id'), - 'content': req_json.get('content'), - 'meta_info': req_json.get('meta_info'), - 'page_number': req_json.get('page_number'), - 'create_datetime': now, - 'create_user': user, - 'create_program': program, - 'update_datetime': now, - 'update_user': user, - 'update_program': program + "id": req_json.get("id"), + "document_id": req_json.get("document_id"), + "status": req_json.get("status"), + "task_id": req_json.get("task_id"), + "content": req_json.get("content"), + "meta_info": req_json.get("meta_info"), + "page_number": req_json.get("page_number"), + "create_datetime": now, + "create_user": user, + "create_program": program, + "update_datetime": now, + "update_user": user, + "update_program": program, } sql = """ @@ -79,21 +74,19 @@ def add( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def update_document_chunk_status_and_start_time( - req_json, - pool -): + +def update_document_chunk_status_and_start_time(req_json, pool): """Update the status and start time with id""" - now = req_json['start_time'] - program = '开始处理chunk后的内容' + now = req_json["start_time"] + program = "开始处理chunk后的内容" params = { - 'id': req_json['id'], - 'status': req_json['status'], - 'start_time': now, - 'update_datetime': now, - 'update_user': req_json['update_user'], - 'update_program': program + "id": req_json["id"], + "status": req_json["status"], + "start_time": now, + "update_datetime": now, + "update_user": req_json["update_user"], + "update_program": program, } sql = """ @@ -110,21 +103,19 @@ def update_document_chunk_status_and_start_time( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def update_document_chunk_status_and_end_time( - req_json, - pool -): + +def update_document_chunk_status_and_end_time(req_json, pool): """Update the status and end time with id""" - now = req_json['end_time'] - program = 'chunk后的内容处理完成' + now = req_json["end_time"] + program = "chunk后的内容处理完成" params = { - 'id': req_json['id'], - 'status': req_json['status'], - 'end_time': now, - 'update_datetime': now, - 'update_user': req_json['update_user'], - 'update_program': program + "id": req_json["id"], + "status": req_json["status"], + "end_time": now, + "update_datetime": now, + "update_user": req_json["update_user"], + "update_program": program, } sql = """ @@ -141,21 +132,17 @@ def update_document_chunk_status_and_end_time( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def delete_by_task_id( - req_json, - pool -): + +def delete_by_task_id(req_json, pool): """delete info by task id. - + req_json is a dictionary object. 
for example: { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['id'] - } + params = {"task_id": req_json["id"]} sql = """ delete from public.data_process_task_document_chunk @@ -166,14 +153,10 @@ def delete_by_task_id( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def list_by_status( - req_json, - pool -): + +def list_by_status(req_json, pool): """Retrieve a list of statuses marked as in progress and failed.""" - params = { - 'document_id': req_json.get('document_id') - } + params = {"document_id": req_json.get("document_id")} sql = """ select diff --git a/pypi/data-processing/src/database_operate/data_process_document_db_operate.py b/pypi/data-processing/src/database_operate/data_process_document_db_operate.py index 2a27bbcd0..0f242b33d 100644 --- a/pypi/data-processing/src/database_operate/data_process_document_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_document_db_operate.py @@ -12,36 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. -import ulid - from database_clients import postgresql_pool_client from utils import date_time_utils -def add( - req_json, - pool -): +def add(req_json, pool): """Add a new record""" now = date_time_utils.now_str() - user = req_json['creator'] - program = '数据处理文件进度-新增' + user = req_json["creator"] + program = "数据处理文件进度-新增" params = { - 'id': req_json.get('id'), - 'file_name': req_json.get('file_name'), - 'status': req_json.get('status'), - 'progress': req_json.get('progress'), - 'task_id': req_json.get('task_id'), - 'from_source_type': req_json.get('from_source_type'), - 'from_source_path': req_json.get('from_source_path'), - 'document_type': req_json.get('document_type'), - 'create_datetime': now, - 'create_user': user, - 'create_program': program, - 'update_datetime': now, - 'update_user': user, - 'update_program': program + "id": req_json.get("id"), + "file_name": req_json.get("file_name"), + "status": req_json.get("status"), + "progress": req_json.get("progress"), + "task_id": req_json.get("task_id"), + "from_source_type": req_json.get("from_source_type"), + "from_source_path": req_json.get("from_source_path"), + "document_type": req_json.get("document_type"), + "create_datetime": now, + "create_user": user, + "create_program": program, + "update_datetime": now, + "update_user": user, + "update_program": program, } sql = """ @@ -82,21 +77,19 @@ def add( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def update_document_status_and_start_time( - req_json, - pool -): + +def update_document_status_and_start_time(req_json, pool): """Update the status and start time with id""" - now = req_json['start_time'] - program = '文件开始处理-修改' + now = req_json["start_time"] + program = "文件开始处理-修改" params = { - 'id': req_json['id'], - 'status': req_json['status'], - 'start_time': now, - 'chunk_size': req_json['chunk_size'], - 'update_datetime': now, - 'update_program': program + "id": req_json["id"], + "status": req_json["status"], + "start_time": now, + "chunk_size": req_json["chunk_size"], + "update_datetime": now, + "update_program": program, } sql = """ @@ -113,20 +106,18 @@ def update_document_status_and_start_time( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def update_document_status_and_end_time( - req_json, - pool -): + +def update_document_status_and_end_time(req_json, pool): """Update the status and end time with id""" - 
now = req_json['end_time'] - program = '文件处理完成-修改' + now = req_json["end_time"] + program = "文件处理完成-修改" params = { - 'id': req_json['id'], - 'status': req_json['status'], - 'end_time': now, - 'update_datetime': now, - 'update_program': program + "id": req_json["id"], + "status": req_json["status"], + "end_time": now, + "update_datetime": now, + "update_program": program, } sql = """ @@ -142,20 +133,18 @@ def update_document_status_and_end_time( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def update_document_progress( - req_json, - pool -): + +def update_document_progress(req_json, pool): """Update the progress with id""" now = date_time_utils.now_str() - program = '文件处理进度-修改' + program = "文件处理进度-修改" params = { - 'id': req_json['id'], - 'progress': req_json['progress'], - 'update_datetime': now, - 'update_user': req_json['update_user'], - 'update_program': program + "id": req_json["id"], + "progress": req_json["progress"], + "update_datetime": now, + "update_user": req_json["update_user"], + "update_program": program, } sql = """ @@ -170,14 +159,10 @@ def update_document_progress( res = postgresql_pool_client.execute_update(pool, sql, params) return res -def list_file_by_task_id( - req_json, - pool -): + +def list_file_by_task_id(req_json, pool): """info with id""" - params = { - 'task_id': req_json['task_id'] - } + params = {"task_id": req_json["task_id"]} sql = """ select @@ -196,21 +181,17 @@ def list_file_by_task_id( res = postgresql_pool_client.execute_query(pool, sql, params) return res -def delete_by_task_id( - req_json, - pool -): + +def delete_by_task_id(req_json, pool): """delete info by task id. - + req_json is a dictionary object. for example: { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['id'] - } + params = {"task_id": req_json["id"]} sql = """ delete from public.data_process_task_document @@ -222,14 +203,9 @@ def delete_by_task_id( return res -def list_by_task_id_and_status( - req_json, - pool -): +def list_by_task_id_and_status(req_json, pool): """info with task id and status""" - params = { - 'task_id': req_json.get('id') - } + params = {"task_id": req_json.get("id")} sql = """ select diff --git a/pypi/data-processing/src/database_operate/data_process_log_db_operate.py b/pypi/data-processing/src/database_operate/data_process_log_db_operate.py index f74d412c4..186695d90 100644 --- a/pypi/data-processing/src/database_operate/data_process_log_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_log_db_operate.py @@ -12,35 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- -import ujson - from database_clients import postgresql_pool_client from utils import date_time_utils -def add( - req_json, - pool -): +def add(req_json, pool): """Add a new record""" now = date_time_utils.now_str() - user = req_json['creator'] - program = '数据处理任务日志-新增' + user = req_json["creator"] + program = "数据处理任务日志-新增" params = { - 'id': req_json.get('id'), - 'task_id': req_json.get('task_id'), - 'type': req_json.get('type'), - 'status': 'processing', - 'error_msg': req_json.get('error_msg'), - 'start_datetime': now, - 'create_datetime': now, - 'create_user': user, - 'create_program': program, - 'update_datetime': now, - 'update_user': user, - 'update_program': program + "id": req_json.get("id"), + "task_id": req_json.get("task_id"), + "type": req_json.get("type"), + "status": "processing", + "error_msg": req_json.get("error_msg"), + "start_datetime": now, + "create_datetime": now, + "create_user": user, + "create_program": program, + "update_datetime": now, + "update_user": user, + "update_program": program, } sql = """ @@ -78,23 +72,20 @@ def add( return res -def update_status_by_id( - req_json, - pool -): +def update_status_by_id(req_json, pool): """Update the status with id""" now = date_time_utils.now_str() - user = req_json['creator'] - program = '添加错误日志信息' + user = req_json["creator"] + program = "添加错误日志信息" params = { - 'id': req_json['id'], - 'status': req_json['status'], - 'error_msg': req_json['error_msg'], - 'end_datetime': now, - 'update_datetime': now, - 'update_program': program, - 'update_user': user + "id": req_json["id"], + "status": req_json["status"], + "error_msg": req_json["error_msg"], + "end_datetime": now, + "update_datetime": now, + "update_program": program, + "update_user": user, } sql = """ @@ -113,21 +104,16 @@ def update_status_by_id( return res -def delete_by_task_id( - req_json, - pool -): +def delete_by_task_id(req_json, pool): """delete info by task id. - + req_json is a dictionary object. 
for example: { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['id'] - } + params = {"task_id": req_json["id"]} sql = """ delete from public.data_process_task_log @@ -137,4 +123,3 @@ def delete_by_task_id( res = postgresql_pool_client.execute_update(pool, sql, params) return res - diff --git a/pypi/data-processing/src/database_operate/data_process_stage_log_db_operate.py b/pypi/data-processing/src/database_operate/data_process_stage_log_db_operate.py index 4f1a3800b..204755f93 100644 --- a/pypi/data-processing/src/database_operate/data_process_stage_log_db_operate.py +++ b/pypi/data-processing/src/database_operate/data_process_stage_log_db_operate.py @@ -14,36 +14,32 @@ import ulid - from database_clients import postgresql_pool_client from utils import date_time_utils -def insert( - req_json, - pool -): +def insert(req_json, pool): """Add a new record""" now = date_time_utils.now_str() - user = req_json['creator'] - program = '数据处理任务阶段日志-新增' + user = req_json["creator"] + program = "数据处理任务阶段日志-新增" params = { - 'id': ulid.ulid(), - 'task_id': req_json.get('task_id'), - 'log_id': req_json.get('log_id'), - 'log_datetime': now, - 'file_name': req_json.get('file_name'), - 'stage_name': req_json.get('stage_name'), - 'stage_status': req_json.get('stage_status'), - 'stage_detail': req_json.get('stage_detail'), - 'error_msg': req_json.get('error_msg'), - 'create_datetime': now, - 'create_user': user, - 'create_program': program, - 'update_datetime': now, - 'update_user': user, - 'update_program': program + "id": ulid.ulid(), + "task_id": req_json.get("task_id"), + "log_id": req_json.get("log_id"), + "log_datetime": now, + "file_name": req_json.get("file_name"), + "stage_name": req_json.get("stage_name"), + "stage_status": req_json.get("stage_status"), + "stage_detail": req_json.get("stage_detail"), + "error_msg": req_json.get("error_msg"), + "create_datetime": now, + "create_user": user, + "create_program": program, + "update_datetime": now, + "update_user": user, + "update_program": program, } sql = """ @@ -87,14 +83,9 @@ def insert( return res -def list_by_task_id( - req_json, - pool -): +def list_by_task_id(req_json, pool): """Get the list data for data processing log by task id""" - params = { - 'task_id': req_json.get('id') - } + params = {"task_id": req_json.get("id")} sql = """ select @@ -117,21 +108,16 @@ def list_by_task_id( return res -def delete_by_task_id( - req_json, - pool -): +def delete_by_task_id(req_json, pool): """delete info by task id. - + req_json is a dictionary object. 
for example: { "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } pool: databasec connection pool; """ - params = { - 'task_id': req_json['id'] - } + params = {"task_id": req_json["id"]} sql = """ delete from public.data_process_task_stage_log @@ -143,14 +129,11 @@ def delete_by_task_id( return res -def info_by_stage_and_file_name( - req_json, - pool -): +def info_by_stage_and_file_name(req_json, pool): params = { - 'task_id': req_json.get('id'), - 'stage_name': req_json.get('type'), - 'file_name': req_json.get('file_name') + "task_id": req_json.get("id"), + "stage_name": req_json.get("type"), + "file_name": req_json.get("file_name"), } sql = """ diff --git a/pypi/data-processing/src/file_handle/common_handle.py b/pypi/data-processing/src/file_handle/common_handle.py index e252501ff..2b5b713bc 100644 --- a/pypi/data-processing/src/file_handle/common_handle.py +++ b/pypi/data-processing/src/file_handle/common_handle.py @@ -15,13 +15,9 @@ import base64 import logging -import os import traceback -import pandas as pd import ulid -from langchain.text_splitter import SpacyTextSplitter - from common import log_tag_const from common.config import config from database_operate import (data_process_detail_db_operate, @@ -32,27 +28,23 @@ from llm_api_service.qa_provider_zhi_pu_ai_online import \ QAProviderZhiPuAIOnline from transform.text import clean_transform, privacy_transform -from utils import csv_utils, date_time_utils, docx_utils, file_utils +from utils import csv_utils, date_time_utils, file_utils logger = logging.getLogger(__name__) def text_manipulate( - all_document_for_process, - file_name, - support_type, - conn_pool, - create_user + all_document_for_process, file_name, support_type, conn_pool, create_user ): """Manipulate the text content. - + all_document_for_process: document info file_name: file name; support_type: support type; conn_pool: database connection pool; create_user: creator; """ - + logger.debug(f"{log_tag_const.COMMON_HANDLE} Start to manipulate the text") try: @@ -60,48 +52,48 @@ def text_manipulate( document_chunk_size = len(all_document_for_process) # 更新文件状态为开始 - task_id = all_document_for_process[0].get('task_id') - document_id = all_document_for_process[0].get('document_id') + task_id = all_document_for_process[0].get("task_id") + document_id = all_document_for_process[0].get("document_id") _update_document_status_and_start_time( - id=all_document_for_process[0].get('document_id'), + id=all_document_for_process[0].get("document_id"), chunk_size=document_chunk_size, - conn_pool=conn_pool + conn_pool=conn_pool, ) text_process_success_num = 0 for document in all_document_for_process: - document_chunk_id = document.get('id') + document_chunk_id = document.get("id") # Clean the data such as removing invisible characters. clean_result = _data_clean( support_type_map=support_type_map, file_name=file_name, - data=document.get('content'), + data=document.get("content"), conn_pool=conn_pool, task_id=task_id, document_id=document_id, document_chunk_id=document_chunk_id, - create_user=create_user + create_user=create_user, ) - if clean_result['status'] == 200: - content = clean_result['data'] + if clean_result["status"] == 200: + content = clean_result["data"] # Remove the privacy info such as removing email. 
clean_result = _remove_privacy_info( support_type_map=support_type_map, file_name=file_name, - data=document.get('content'), + data=document.get("content"), conn_pool=conn_pool, task_id=task_id, document_id=document_id, document_chunk_id=document_chunk_id, - create_user=create_user + create_user=create_user, ) - if clean_result['status'] == 200: - content = clean_result['data'] + if clean_result["status"] == 200: + content = clean_result["data"] - if support_type_map.get('qa_split'): + if support_type_map.get("qa_split"): logger.debug(f"{log_tag_const.QA_SPLIT} Start to split QA.") text_process_success_num += 1 @@ -115,64 +107,64 @@ def text_manipulate( document_id=document_id, text_process_success_num=text_process_success_num, conn_pool=conn_pool, - create_user=create_user + create_user=create_user, ) - if qa_response.get('status') != 200: + if qa_response.get("status") != 200: return qa_response - + # 文件处理成功,更新data_process_task_document中的文件状态 _updata_document_status_and_end_time( - id=document_id, - status='success', - conn_pool=conn_pool + id=document_id, status="success", conn_pool=conn_pool ) # 通过documentId查询生成的所有QA数据 qa_list = data_process_detail_db_operate.query_question_answer_list( - document_id=document_id, - pool=conn_pool + document_id=document_id, pool=conn_pool ) - qa_data_dict = [['q', 'a', 'file_name', 'page_number', 'chunk_content']] - for item in qa_list.get('data'): - qa_data_dict.append([ - item.get('question'), - item.get('answer'), - item.get('file_name'), - item.get('page_number'), - item.get('content') - ]) - - # Save the csv file. - file_name_without_extension = file_utils.get_file_name_without_extension(file_name) - file_name_csv = file_name_without_extension + '.csv' + qa_data_dict = [["q", "a", "file_name", "page_number", "chunk_content"]] + for item in qa_list.get("data"): + qa_data_dict.append( + [ + item.get("question"), + item.get("answer"), + item.get("file_name"), + item.get("page_number"), + item.get("content"), + ] + ) + + # Save the csv file. + file_name_without_extension = file_utils.get_file_name_without_extension( + file_name + ) + file_name_csv = file_name_without_extension + ".csv" csv_utils.save_csv( - file_name=file_name_csv, - phase_value='final', - data=qa_data_dict + file_name=file_name_csv, phase_value="final", data=qa_data_dict ) - + logger.debug(f"{log_tag_const.COMMON_HANDLE} Finish manipulating the text") return { - 'status': 200, - 'message': '', - 'data': { - 'object_name': file_name_csv, - 'object_count': len(qa_list.get('data')) - } + "status": 200, + "message": "", + "data": { + "object_name": file_name_csv, + "object_count": len(qa_list.get("data")), + }, } except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.COMMON_HANDLE} There is an error when manipulate ", - f"the text in common handler. \n{traceback.format_exc()}" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.COMMON_HANDLE} There is an error when manipulate ", + f"the text in common handler. \n{traceback.format_exc()}", + ] + ) + ) logger.debug(f"{log_tag_const.COMMON_HANDLE} Finish manipulating the text") - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} + def _data_clean( support_type_map, @@ -182,20 +174,20 @@ def _data_clean( document_chunk_id, file_name, create_user, - conn_pool + conn_pool, ): """Clean the data. 
- + support_type_map: example { "qa_split": { "type": "qa_split", "name": "xx", "namespace": "xx" - }, + }, "remove_invisible_characters": { "type": "remove_invisible_characters" - }, + }, "space_standardization": { "type": "space_standardization" }, @@ -209,333 +201,294 @@ def _data_clean( task_id: data process task id; """ # remove invisible characters - if support_type_map.get('remove_invisible_characters'): - result = clean_transform.remove_invisible_characters( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + if support_type_map.get("remove_invisible_characters"): + result = clean_transform.remove_invisible_characters(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_invisible_characters', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_invisible_characters", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_invisible_characters', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_invisible_characters", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - # process for space standardization - if support_type_map.get('space_standardization'): - result = clean_transform.space_standardization( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + if support_type_map.get("space_standardization"): + result = clean_transform.space_standardization(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - 
delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'space_standardization', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "space_standardization", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'space_standardization', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "space_standardization", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - # process for remove garbled text - if support_type_map.get('remove_garbled_text'): - result = clean_transform.remove_garbled_text( - text=data - ) - if result['status'] == 200: - if result['data']['found'] > 0: + if support_type_map.get("remove_garbled_text"): + result = clean_transform.remove_garbled_text(text=data) + if result["status"] == 200: + if result["data"]["found"] > 0: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_garbled_text', - 'pre_content': data, - 'post_content': result['data']['text'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_garbled_text", + "pre_content": data, + "post_content": result["data"]["text"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_garbled_text', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + 
"task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_garbled_text", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - # process for Traditional Chinese to Simplified Chinese - if support_type_map.get('traditional_to_simplified'): - result = clean_transform.traditional_to_simplified( - text=data - ) - if result['status'] == 200: - if result['data']['found'] > 0: + if support_type_map.get("traditional_to_simplified"): + result = clean_transform.traditional_to_simplified(text=data) + if result["status"] == 200: + if result["data"]["found"] > 0: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'traditional_to_simplified', - 'pre_content': data, - 'post_content': result['data']['text'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "traditional_to_simplified", + "pre_content": data, + "post_content": result["data"]["text"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'traditional_to_simplified', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "traditional_to_simplified", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - # process for clean html code in text samples - if support_type_map.get('remove_html_tag'): - result = clean_transform.remove_html_tag( - text=data - ) - if result['status'] == 200: - if result['data']['found'] > 0: + if support_type_map.get("remove_html_tag"): + result = clean_transform.remove_html_tag(text=data) + if result["status"] == 200: + if result["data"]["found"] > 0: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': 
ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_html_tag', - 'pre_content': data, - 'post_content': result['data']['text'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_html_tag", + "pre_content": data, + "post_content": result["data"]["text"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_html_tag', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_html_tag", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - # process for remove emojis - if support_type_map.get('remove_emojis'): - result = clean_transform.remove_emojis( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + if support_type_map.get("remove_emojis"): + result = clean_transform.remove_emojis(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_emojis', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_emojis", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_emojis', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": 
file_name, + "transform_type": "remove_emojis", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - return { - 'status': 200, - 'message': '', - 'data': data - } + return {"status": 200, "message": "", "data": data} def _remove_privacy_info( @@ -546,20 +499,20 @@ def _remove_privacy_info( document_chunk_id, file_name, create_user, - conn_pool + conn_pool, ): - """"Remove the privacy info such as removing email. - + """ "Remove the privacy info such as removing email. + support_type_map: example { "qa_split": { "type": "qa_split", "name": "xx", "namespace": "xx" - }, + }, "remove_invisible_characters": { "type": "remove_invisible_characters" - }, + }, "space_standardization": { "type": "space_standardization" }, @@ -573,333 +526,298 @@ def _remove_privacy_info( task_id: data process task id; """ # remove email - if support_type_map.get('remove_email'): - result = privacy_transform.remove_email( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + if support_type_map.get("remove_email"): + result = privacy_transform.remove_email(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_email', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_email", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_email', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_email", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - # remove ip addresses - if support_type_map.get('remove_ip_address'): - result = privacy_transform.remove_ip_address( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + if support_type_map.get("remove_ip_address"): + 
result = privacy_transform.remove_ip_address(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_ip_address', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_ip_address", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_ip_address', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_ip_address", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) # remove number - if support_type_map.get('remove_number'): + if support_type_map.get("remove_number"): # remove phone - result = privacy_transform.remove_phone( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + result = privacy_transform.remove_phone(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_number', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_number", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, 
} data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_number', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_number", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - + # remove id card - result = privacy_transform.remove_id_card( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + result = privacy_transform.remove_id_card(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_number', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_number", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_number', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_number", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) # remove weixin - result = privacy_transform.remove_weixin( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + result = privacy_transform.remove_weixin(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': 
document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_number', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_number", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_number', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_number", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) # remove bank card - result = privacy_transform.remove_bank_card( - text=data - ) - if result['status'] == 200: - clean_data = result['data']['clean_data'] + result = privacy_transform.remove_bank_card(text=data) + if result["status"] == 200: + clean_data = result["data"]["clean_data"] if len(clean_data) > 0: for item in clean_data: # 避免重试的时候,新增重复性数据 delete_transform_item = { - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, } data_process_detail_db_operate.delete_transform_by_document_chunk( - delete_transform_item, - pool=conn_pool + delete_transform_item, pool=conn_pool ) task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'transform_type': 'remove_number', - 'pre_content': item['pre_content'], - 'post_content': item['post_content'], - 'status': 'success', - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_number", + "pre_content": item["pre_content"], + "post_content": item["post_content"], + "status": "success", + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - data = result['data']['text'] + data = result["data"]["text"] else: task_detail_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 
'transform_type': 'remove_number', - 'status': 'fail', - 'error_message': result.get('message'), - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "transform_type": "remove_number", + "status": "fail", + "error_message": result.get("message"), + "create_user": create_user, } data_process_detail_db_operate.insert_transform_info( - task_detail_item, - pool=conn_pool + task_detail_item, pool=conn_pool ) - - return { - 'status': 200, - 'message': '', - 'data': data - } + + return {"status": 200, "message": "", "data": data} def _qa_split( @@ -912,70 +830,61 @@ def _qa_split( document_id, text_process_success_num, conn_pool, - create_user + create_user, ): - qa_list_dict = support_type_map.get('qa_split') - llm_config = qa_list_dict.get('llm_config') + qa_list_dict = support_type_map.get("qa_split") + llm_config = qa_list_dict.get("llm_config") # 更新chunk状态为开始 _update_document_chunk_status_and_start_time( - id=document_chunk_id, - update_user=create_user, - conn_pool=conn_pool + id=document_chunk_id, update_user=create_user, conn_pool=conn_pool ) - qa_response = _generate_qa_list( - content=content, - llm_config=llm_config - ) + qa_response = _generate_qa_list(content=content, llm_config=llm_config) - if qa_response.get('status') != 200: + if qa_response.get("status") != 200: # 处理失败 # 更新data_process_task_document_chunk中的状态 _updata_document_chunk_status_and_end_time( id=document_chunk_id, update_user=create_user, - status='fail', - conn_pool=conn_pool + status="fail", + conn_pool=conn_pool, ) # 更新data_process_task_document中的文件状态 _updata_document_status_and_end_time( - id=document_id, - status='fail', - conn_pool=conn_pool + id=document_id, status="fail", conn_pool=conn_pool ) else: # 将QA数据存入表中 - qa_data = qa_response.get('data') + qa_data = qa_response.get("data") for i in range(len(qa_data)): qa_insert_item = { - 'id': ulid.ulid(), - 'task_id': task_id, - 'document_id': document_id, - 'document_chunk_id': document_chunk_id, - 'file_name': file_name, - 'question': qa_data[i][0], - 'answer': qa_data[i][1], - 'create_user': create_user + "id": ulid.ulid(), + "task_id": task_id, + "document_id": document_id, + "document_chunk_id": document_chunk_id, + "file_name": file_name, + "question": qa_data[i][0], + "answer": qa_data[i][1], + "create_user": create_user, } - + data_process_detail_db_operate.insert_question_answer_info( - qa_insert_item, - pool=conn_pool + qa_insert_item, pool=conn_pool ) data_process_detail_db_operate.insert_question_answer_clean_info( - qa_insert_item, - pool=conn_pool + qa_insert_item, pool=conn_pool ) # 更新data_process_task_document_chunk中的状态 _updata_document_chunk_status_and_end_time( id=document_chunk_id, update_user=create_user, - status='success', - conn_pool=conn_pool + status="success", + conn_pool=conn_pool, ) # 更新文件处理进度 @@ -984,93 +893,90 @@ def _qa_split( id=document_id, progress=progress, update_user=create_user, - conn_pool=conn_pool + conn_pool=conn_pool, ) return qa_response -def _generate_qa_list( - content, - llm_config -): +def _generate_qa_list(content, llm_config): """Generate the Question and Answer list. 
content: the text used to generate QA; llm_config: llms config info; """ - name=llm_config.get('name') - namespace=llm_config.get('namespace') - model=llm_config.get('model') - temperature=llm_config.get('temperature') - prompt_template=llm_config.get('prompt_template') - top_p=llm_config.get('top_p') - max_tokens=llm_config.get('max_tokens') + name = llm_config.get("name") + namespace = llm_config.get("namespace") + model = llm_config.get("model") + temperature = llm_config.get("temperature") + prompt_template = llm_config.get("prompt_template") + top_p = llm_config.get("top_p") + max_tokens = llm_config.get("max_tokens") # llms cr 中模型相关信息 - llm_spec_info = model_cr.get_spec_for_llms_k8s_cr( - name=name, - namespace=namespace - ) + llm_spec_info = model_cr.get_spec_for_llms_k8s_cr(name=name, namespace=namespace) # Generate the QA list. qa_list = [] - if llm_spec_info.get('data').get('provider').get('worker'): + if llm_spec_info.get("data").get("provider").get("worker"): # get base url for configmap base_url = model_cr.get_worker_base_url_k8s_configmap( - name=config.k8s_default_config, - namespace=config.k8s_pod_namespace + name=config.k8s_default_config, namespace=config.k8s_pod_namespace + ) + logger.debug( + "".join( + [ + f"worker llm \n", + f"name: {name}\n", + f"namespace: {namespace}\n", + f"model: {model}\n", + f"base_url: {base_url}\n", + ] + ) ) - logger.debug(''.join([ - f"worker llm \n", - f"name: {name}\n", - f"namespace: {namespace}\n", - f"model: {model}\n", - f"base_url: {base_url}\n" - ])) # generate QA list qa_provider = QAProviderOpenAI( - api_key='fake', + api_key="fake", base_url=base_url, model=model, temperature=temperature, - max_tokens=max_tokens + max_tokens=max_tokens, ) data = qa_provider.generate_qa_list( - text=content, - prompt_template=prompt_template + text=content, prompt_template=prompt_template ) - if data.get('status') != 200: + if data.get("status") != 200: # 文件处理失败 return data - qa_list.extend(data.get('data')) + qa_list.extend(data.get("data")) else: - endpoint = llm_spec_info.get('data').get('provider').get('endpoint') - base_url = endpoint.get('url') - secret_name = endpoint.get('authSecret').get('name') + endpoint = llm_spec_info.get("data").get("provider").get("endpoint") + base_url = endpoint.get("url") + secret_name = endpoint.get("authSecret").get("name") # get api key for secret - secret_info = model_cr.get_secret_info( - name=secret_name, - namespace=namespace + secret_info = model_cr.get_secret_info(name=secret_name, namespace=namespace) + api_key = secret_info.get("apiKey") + llm_type = llm_spec_info.get("data").get("type") + + logger.debug( + "".join( + [ + f"3rd_party llm \n", + f"name: {name}\n", + f"namespace: {namespace}\n", + f"model: {model}\n", + f"llm_type: {llm_type}\n", + ] + ) ) - api_key = secret_info.get('apiKey') - llm_type = llm_spec_info.get('data').get('type') - - logger.debug(''.join([ - f"3rd_party llm \n", - f"name: {name}\n", - f"namespace: {namespace}\n", - f"model: {model}\n", - f"llm_type: {llm_type}\n" - ])) - - if llm_type == 'zhipuai': - zhipuai_api_key = base64.b64decode(api_key).decode('utf-8') + + if llm_type == "zhipuai": + zhipuai_api_key = base64.b64decode(api_key).decode("utf-8") qa_provider = QAProviderZhiPuAIOnline(api_key=zhipuai_api_key) # generate QA list @@ -1079,29 +985,21 @@ def _generate_qa_list( model=model, prompt_template=prompt_template, top_p=top_p, - temperature=temperature + temperature=temperature, ) - if data.get('status') != 200: + if data.get("status") != 200: return data - 
qa_list.extend(data.get('data')) + qa_list.extend(data.get("data")) else: - return { - 'status': 1000, - 'message': '暂时不支持该类型的模型', - 'data': '' - } + return {"status": 1000, "message": "暂时不支持该类型的模型", "data": ""} - return { - 'status': 200, - 'message': '', - 'data': qa_list - } + return {"status": 200, "message": "", "data": qa_list} def _convert_support_type_to_map(supprt_type): """Convert support type to map. - + support_type: support type list example [ @@ -1121,176 +1019,130 @@ def _convert_support_type_to_map(supprt_type): """ result = {} for item in supprt_type: - result[item['type']] = item + result[item["type"]] = item return result -def _update_document_status_and_start_time( - id, - chunk_size, - conn_pool -): + +def _update_document_status_and_start_time(id, chunk_size, conn_pool): try: now = date_time_utils.now_str() document_update_item = { - 'id': id, - 'status': 'doing', - 'start_time': now, - 'chunk_size': chunk_size + "id": id, + "status": "doing", + "start_time": now, + "chunk_size": chunk_size, } data_process_document_db_operate.update_document_status_and_start_time( - document_update_item, - pool=conn_pool + document_update_item, pool=conn_pool ) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.COMMON_HANDLE} update document status ", - f"\n{traceback.format_exc()}" - ])) - return { - 'status': 1000, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.COMMON_HANDLE} update document status ", + f"\n{traceback.format_exc()}", + ] + ) + ) + return {"status": 1000, "message": str(ex), "data": traceback.format_exc()} -def _updata_document_status_and_end_time( - id, - status, - conn_pool -): + +def _updata_document_status_and_end_time(id, status, conn_pool): try: now = date_time_utils.now_str() - document_update_item = { - 'id': id, - 'status': status, - 'end_time': now - } + document_update_item = {"id": id, "status": status, "end_time": now} data_process_document_db_operate.update_document_status_and_end_time( - document_update_item, - pool=conn_pool + document_update_item, pool=conn_pool ) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.COMMON_HANDLE} update document status ", - f"\n{traceback.format_exc()}" - ])) - return { - 'status': 1000, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.COMMON_HANDLE} update document status ", + f"\n{traceback.format_exc()}", + ] + ) + ) + return {"status": 1000, "message": str(ex), "data": traceback.format_exc()} -def _updata_document_progress( - id, - progress, - update_user, - conn_pool -): + +def _updata_document_progress(id, progress, update_user, conn_pool): try: now = date_time_utils.now_str() document_update_item = { - 'id': id, - 'update_user': update_user, - 'progress': progress + "id": id, + "update_user": update_user, + "progress": progress, } data_process_document_db_operate.update_document_progress( - document_update_item, - pool=conn_pool + document_update_item, pool=conn_pool ) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.COMMON_HANDLE} update document progress ", - f"\n{traceback.format_exc()}" - ])) - 
return { - 'status': 1000, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.COMMON_HANDLE} update document progress ", + f"\n{traceback.format_exc()}", + ] + ) + ) + return {"status": 1000, "message": str(ex), "data": traceback.format_exc()} -def _update_document_chunk_status_and_start_time( - id, - update_user, - conn_pool -): + +def _update_document_chunk_status_and_start_time(id, update_user, conn_pool): try: now = date_time_utils.now_str() document_chunk_update_item = { - 'id': id, - 'status': 'doing', - 'update_user': update_user, - 'start_time': now + "id": id, + "status": "doing", + "update_user": update_user, + "start_time": now, } data_process_document_chunk_db_operate.update_document_chunk_status_and_start_time( - document_chunk_update_item, - pool=conn_pool + document_chunk_update_item, pool=conn_pool ) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.COMMON_HANDLE} update chunk document status ", - f"\n{traceback.format_exc()}" - ])) - return { - 'status': 1000, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.COMMON_HANDLE} update chunk document status ", + f"\n{traceback.format_exc()}", + ] + ) + ) + return {"status": 1000, "message": str(ex), "data": traceback.format_exc()} -def _updata_document_chunk_status_and_end_time( - id, - status, - update_user, - conn_pool -): + +def _updata_document_chunk_status_and_end_time(id, status, update_user, conn_pool): try: now = date_time_utils.now_str() document_chunk_update_item = { - 'id': id, - 'status': status, - 'update_user': update_user, - 'end_time': now + "id": id, + "status": status, + "update_user": update_user, + "end_time": now, } data_process_document_chunk_db_operate.update_document_chunk_status_and_end_time( - document_chunk_update_item, - pool=conn_pool + document_chunk_update_item, pool=conn_pool ) - return { - 'status': 200, - 'message': '', - 'data': '' - } + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.COMMON_HANDLE} update document status ", - f"\n{traceback.format_exc()}" - ])) - return { - 'status': 1000, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.COMMON_HANDLE} update document status ", + f"\n{traceback.format_exc()}", + ] + ) + ) + return {"status": 1000, "message": str(ex), "data": traceback.format_exc()} diff --git a/pypi/data-processing/src/file_handle/csv_handle.py b/pypi/data-processing/src/file_handle/csv_handle.py index 339f1e8fd..fddf5d281 100644 --- a/pypi/data-processing/src/file_handle/csv_handle.py +++ b/pypi/data-processing/src/file_handle/csv_handle.py @@ -18,20 +18,16 @@ import pandas as pd import ulid - from common import log_tag_const -from transform.text import clean_transform, privacy_transform -from utils import csv_utils, date_time_utils, file_utils +from transform.text import clean_transform +from utils import csv_utils, file_utils logger = logging.getLogger(__name__) -def text_manipulate( - file_name, - support_type -): +def text_manipulate(file_name, support_type): """Manipuate the text content. 
- + file_name: file name; support_type: support type; @@ -40,110 +36,84 @@ def text_manipulate( 整个文件都视作处理失败。 """ try: - logger.debug(f"{log_tag_const.CSV_HANDLE} Start to manipulate text in csv file.") + logger.debug( + f"{log_tag_const.CSV_HANDLE} Start to manipulate text in csv file." + ) csv_file_path = file_utils.get_temp_file_path() - file_path = csv_file_path + 'original/' + file_name + file_path = csv_file_path + "original/" + file_name # 获取CSV文件的内容 data = pd.read_csv(file_path) - text_data = data['prompt'] + text_data = data["prompt"] # 数据清洗 - clean_result = _data_clean({ - 'support_type': support_type, - 'file_name': file_name, - 'data': text_data - }) + clean_result = _data_clean( + support_type=support_type, data=text_data, file_name=file_name + ) - if clean_result['status'] != 200: + if clean_result["status"] != 200: return clean_result - text_data = clean_result['data'] - + text_data = clean_result["data"] + # 将清洗后的文件保存为final - new_file_name = file_utils.get_file_name({ - 'file_name': file_name, - 'handle_name': 'final' - }) - - save_csv({ - 'file_name': new_file_name, - 'phase_value': 'final', - 'data': text_data - }) - - logger.debug(f"{log_tag_const.CSV_HANDLE} Finish manipulating text in csv file.") - - return { - 'status': 200, - 'message': '', - 'data': '' - } + new_file_name = file_utils.get_file_name( + {"file_name": file_name, "handle_name": "final"} + ) + + csv_utils.save_csv( + {"file_name": new_file_name, "phase_value": "final", "data": text_data} + ) + + logger.debug( + f"{log_tag_const.CSV_HANDLE} Finish manipulating text in csv file." + ) + + return {"status": 200, "message": "", "data": ""} except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.CSV_HANDLE} There is a error when mainpulate the text ", - f"in a csv file. \n{traceback.format_exc()}" - ])) - return { - 'status': 400, - 'message': '', - 'data': '' - } - - -def _data_clean( - support_type, - data, - file_name -): + logger.error( + "".join( + [ + f"{log_tag_const.CSV_HANDLE} There is a error when mainpulate the text ", + f"in a csv file. \n{traceback.format_exc()}", + ] + ) + ) + return {"status": 400, "message": "", "data": ""} + + +def _data_clean(support_type, data, file_name): """Clean the data. 
- + support_type: support type; data: text content; """ logger.debug(f"{log_tag_const.CSV_HANDLE} Start to clean data in csv.") # 去除不可见字符 - if 'remove_invisible_characters' in support_type: + if "remove_invisible_characters" in support_type: clean_data = [] for item in data: - result = clean_transform.remove_invisible_characters({ - 'text': item - }) - - if result['status'] != 200: - return { - 'status': 400, - 'message': '去除不可见字符失败', - 'data': '' - } - - clean_data.append(result['data']) - + result = clean_transform.remove_invisible_characters({"text": item}) + + if result["status"] != 200: + return {"status": 400, "message": "去除不可见字符失败", "data": ""} + + clean_data.append(result["data"]) + data = clean_data - data.insert(0, ['prompt']) + data.insert(0, ["prompt"]) # 将文件存为middle - file_name = file_utils.get_file_name({ - 'file_name': file_name, - 'handle_name': 'middle' - }) - - csv_utils.save_csv({ - 'file_name': file_name, - 'phase_value': 'middle', - 'data': data - }) - - - logger.debug(f"{log_tag_const.CSV_HANDLE} Finish cleaning data in csv.") - - return { - 'status': 200, - 'message': '', - 'data': data - } + file_name = file_utils.get_file_name( + {"file_name": file_name, "handle_name": "middle"} + ) + csv_utils.save_csv( + {"file_name": file_name, "phase_value": "middle", "data": data} + ) + logger.debug(f"{log_tag_const.CSV_HANDLE} Finish cleaning data in csv.") + return {"status": 200, "message": "", "data": data} diff --git a/pypi/data-processing/src/file_handle/pdf_handle.py b/pypi/data-processing/src/file_handle/pdf_handle.py index e61d30512..0c0f49787 100644 --- a/pypi/data-processing/src/file_handle/pdf_handle.py +++ b/pypi/data-processing/src/file_handle/pdf_handle.py @@ -18,13 +18,12 @@ import ujson import ulid -from langchain.document_loaders import PyPDFLoader -from langchain.text_splitter import SpacyTextSplitter - from common import log_tag_const from common.config import config from database_operate import data_process_document_chunk_db_operate from file_handle import common_handle +from langchain.document_loaders import PyPDFLoader +from langchain.text_splitter import SpacyTextSplitter from utils import file_utils logger = logging.getLogger(__name__) @@ -38,10 +37,10 @@ def text_manipulate( task_id, create_user, chunk_size=None, - chunk_overlap=None + chunk_overlap=None, ): """Manipulate the text content from a pdf file. 
- + file_name: file name; support_type: support type; conn_pool: database connection pool; @@ -49,18 +48,16 @@ def text_manipulate( chunk_size: chunk size; chunk_overlap: chunk overlap; """ - + logger.debug(f"{log_tag_const.PDF_HANDLE} Start to manipulate the text in pdf") try: pdf_file_path = file_utils.get_temp_file_path() - file_path = pdf_file_path + 'original/' + file_name - + file_path = pdf_file_path + "original/" + file_name + # Text splitter documents = _get_documents_by_langchain( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - file_path=file_path + chunk_size=chunk_size, chunk_overlap=chunk_overlap, file_path=file_path ) # step 2 @@ -68,25 +65,24 @@ def text_manipulate( all_document_for_process = [] for document in documents: chunck_id = ulid.ulid() - page = document.metadata.get('page') + 1 + page = document.metadata.get("page") + 1 content = document.page_content.replace("\n", "") meta_info = document.metadata - meta_info['source'] = file_name + meta_info["source"] = file_name chunk_insert_item = { - 'id': chunck_id, - 'document_id': document_id, - 'task_id': task_id, - 'status': 'not_start', - 'content': content, - 'meta_info': ujson.dumps(meta_info, ensure_ascii=False), - 'page_number': page, - 'creator': create_user + "id": chunck_id, + "document_id": document_id, + "task_id": task_id, + "status": "not_start", + "content": content, + "meta_info": ujson.dumps(meta_info, ensure_ascii=False), + "page_number": page, + "creator": create_user, } all_document_for_process.append(chunk_insert_item) data_process_document_chunk_db_operate.add( - chunk_insert_item, - pool=conn_pool + chunk_insert_item, pool=conn_pool ) response = common_handle.text_manipulate( @@ -94,27 +90,24 @@ def text_manipulate( all_document_for_process=all_document_for_process, support_type=support_type, conn_pool=conn_pool, - create_user=create_user + create_user=create_user, ) return response except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.PDF_HANDLE} There is an error when manipulate ", - f"the text in pdf handler. \n{traceback.format_exc()}" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.PDF_HANDLE} There is an error when manipulate ", + f"the text in pdf handler. \n{traceback.format_exc()}", + ] + ) + ) logger.debug(f"{log_tag_const.PDF_HANDLE} Finish manipulating the text in pdf") - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } - -def _get_documents_by_langchain( - chunk_size, - chunk_overlap, - file_path -): + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} + + +def _get_documents_by_langchain(chunk_size, chunk_overlap, file_path): # Split the text. 
if chunk_size is None: chunk_size = config.knowledge_chunk_size @@ -129,7 +122,7 @@ def _get_documents_by_langchain( separator="\n\n", pipeline="zh_core_web_sm", chunk_size=int(chunk_size), - chunk_overlap=int(chunk_overlap) + chunk_overlap=int(chunk_overlap), ) documents = text_splitter.split_documents(pdf_pages) diff --git a/pypi/data-processing/src/file_handle/word_handle.py b/pypi/data-processing/src/file_handle/word_handle.py index ffacfc0c1..b96c9e52f 100644 --- a/pypi/data-processing/src/file_handle/word_handle.py +++ b/pypi/data-processing/src/file_handle/word_handle.py @@ -17,12 +17,11 @@ import traceback import ulid -from langchain.text_splitter import SpacyTextSplitter - from common import log_tag_const from common.config import config from database_operate import data_process_document_chunk_db_operate from file_handle import common_handle +from langchain.text_splitter import SpacyTextSplitter from utils import docx_utils, file_utils logger = logging.getLogger(__name__) @@ -36,10 +35,10 @@ def docx_text_manipulate( task_id, create_user, chunk_size=None, - chunk_overlap=None + chunk_overlap=None, ): """Manipulate the text content from a word file. - + file_name: file name; support_type: support type; conn_pool: database connection pool; @@ -47,18 +46,16 @@ def docx_text_manipulate( chunk_size: chunk size; chunk_overlap: chunk overlap; """ - + logger.debug(f"{log_tag_const.WORD_HANDLE} Start to manipulate the text in word") try: word_file_path = file_utils.get_temp_file_path() - file_path = word_file_path + 'original/' + file_name - + file_path = word_file_path + "original/" + file_name + # Text splitter documents = _get_documents_by_langchain( - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - file_path=file_path + chunk_size=chunk_size, chunk_overlap=chunk_overlap, file_path=file_path ) # step 2 @@ -68,20 +65,19 @@ def docx_text_manipulate( chunck_id = ulid.ulid() content = document.replace("\n", "") chunk_insert_item = { - 'id': chunck_id, - 'document_id': document_id, - 'task_id': task_id, - 'status': 'not_start', - 'content': content, - 'meta_info': '', - 'page_number': '', - 'creator': create_user + "id": chunck_id, + "document_id": document_id, + "task_id": task_id, + "status": "not_start", + "content": content, + "meta_info": "", + "page_number": "", + "creator": create_user, } all_document_for_process.append(chunk_insert_item) data_process_document_chunk_db_operate.add( - chunk_insert_item, - pool=conn_pool + chunk_insert_item, pool=conn_pool ) response = common_handle.text_manipulate( @@ -89,27 +85,26 @@ def docx_text_manipulate( all_document_for_process=all_document_for_process, support_type=support_type, conn_pool=conn_pool, - create_user=create_user + create_user=create_user, ) return response except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.WORD_HANDLE} There is an error when manipulate ", - f"the text in word handler. \n{traceback.format_exc()}" - ])) - logger.debug(f"{log_tag_const.WORD_HANDLE} Finish manipulating the text in word") - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } - -def _get_documents_by_langchain( - chunk_size, - chunk_overlap, - file_path -): + logger.error( + "".join( + [ + f"{log_tag_const.WORD_HANDLE} There is an error when manipulate ", + f"the text in word handler. 
\n{traceback.format_exc()}", + ] + ) + ) + logger.debug( + f"{log_tag_const.WORD_HANDLE} Finish manipulating the text in word" + ) + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} + + +def _get_documents_by_langchain(chunk_size, chunk_overlap, file_path): # Split the text. if chunk_size is None: chunk_size = config.knowledge_chunk_size @@ -122,7 +117,7 @@ def _get_documents_by_langchain( separator="\n\n", pipeline="zh_core_web_sm", chunk_size=int(chunk_size), - chunk_overlap=int(chunk_overlap) + chunk_overlap=int(chunk_overlap), ) documents = text_splitter.split_text(content) diff --git a/pypi/data-processing/src/kube/client.py b/pypi/data-processing/src/kube/client.py index 9d79c055d..4207fc9c4 100644 --- a/pypi/data-processing/src/kube/client.py +++ b/pypi/data-processing/src/kube/client.py @@ -17,11 +17,10 @@ import os import traceback +from common import log_tag_const from kubernetes import client, config from kubernetes.client import CoreV1Api, CustomObjectsApi -from common import log_tag_const - from .custom_resources import (arcadia_resource_datasets, arcadia_resource_datasources, arcadia_resource_models, @@ -44,8 +43,8 @@ def get_name(self): class KubeEnv: def __init__(self): - self.pod_namespace = os.environ.get('POD_NAMESPACE') - self.kubeconfig_path = os.environ.get('KUBECONFIG') + self.pod_namespace = os.environ.get("POD_NAMESPACE") + self.kubeconfig_path = os.environ.get("KUBECONFIG") if self.kubeconfig_path: config.load_kube_config(self.kubeconfig_path) logger.debug( @@ -60,39 +59,44 @@ def __init__(self): except config.ConfigException: logger.error( f"{log_tag_const.KUBERNETES} There is an error ", - f"when load kubeconfig from in cluster config.\n {traceback.format_exc()}" + f"when load kubeconfig from in cluster config.\n {traceback.format_exc()}", ) - raise RuntimeError(''.join([ - "Failed to load incluster config. ", - "Make sure the code is running inside a Kubernetes cluster." - ])) - + raise RuntimeError( + "".join( + [ + "Failed to load incluster config. 
", + "Make sure the code is running inside a Kubernetes cluster.", + ] + ) + ) + def list_datasources(self, namespace: str, **kwargs): return CustomObjectsApi().list_namespaced_custom_object( arcadia_resource_datasources.get_group(), arcadia_resource_datasources.get_version(), namespace, arcadia_resource_datasources.get_name(), - **kwargs + **kwargs, ) def list_datasets(self, namespace: str, **kwargs): return CustomObjectsApi().list_namespaced_custom_object( arcadia_resource_datasets.get_group(), arcadia_resource_datasets.get_version(), - namespace, + namespace, arcadia_resource_datasets.get_name(), - **kwargs + **kwargs, ) def list_versioneddatasets(self, namespace: str, **kwargs): return CustomObjectsApi().list_namespaced_custom_object( arcadia_resource_versioneddatasets.get_group(), arcadia_resource_versioneddatasets.get_version(), - namespace, arcadia_resource_versioneddatasets.get_name(), - **kwargs + namespace, + arcadia_resource_versioneddatasets.get_name(), + **kwargs, ) - + def patch_versioneddatasets_status(self, namespace: str, name: str, status: any): CustomObjectsApi().patch_namespaced_custom_object_status( arcadia_resource_versioneddatasets.get_group(), @@ -100,18 +104,18 @@ def patch_versioneddatasets_status(self, namespace: str, name: str, status: any) namespace, arcadia_resource_versioneddatasets.get_name(), name, - status + status, ) - + def get_versioneddatasets_status(self, namespace: str, name: str): return CustomObjectsApi().get_namespaced_custom_object_status( arcadia_resource_versioneddatasets.get_group(), arcadia_resource_versioneddatasets.get_version(), - namespace, + namespace, arcadia_resource_versioneddatasets.get_name(), - name + name, ) - + def patch_versioneddatasets_status(self, namespace: str, name: str, status: any): CustomObjectsApi().patch_namespaced_custom_object_status( arcadia_resource_versioneddatasets.get_group(), @@ -119,30 +123,24 @@ def patch_versioneddatasets_status(self, namespace: str, name: str, status: any) namespace, arcadia_resource_versioneddatasets.get_name(), name, - status + status, ) def get_versionedmodels_status(self, namespace: str, name: str): return CustomObjectsApi().get_namespaced_custom_object_status( arcadia_resource_models.get_group(), arcadia_resource_models.get_version(), - namespace, + namespace, arcadia_resource_models.get_name(), - name + name, ) def read_namespaced_config_map(self, namespace: str, name: str): - return CoreV1Api().read_namespaced_config_map( - namespace=namespace, - name=name - ) + return CoreV1Api().read_namespaced_config_map(namespace=namespace, name=name) def get_secret_info(self, namespace: str, name: str): """Get the secret info.""" - data = CoreV1Api().read_namespaced_secret( - namespace=namespace, - name=name - ) + data = CoreV1Api().read_namespaced_secret(namespace=namespace, name=name) return data.data def get_datasource_object(self, namespace: str, name: str): @@ -151,6 +149,6 @@ def get_datasource_object(self, namespace: str, name: str): group=arcadia_resource_models.get_group(), version=arcadia_resource_models.get_version(), namespace=namespace, - plural= arcadia_resource_datasources.get_name(), - name=name - ) \ No newline at end of file + plural=arcadia_resource_datasources.get_name(), + name=name, + ) diff --git a/pypi/data-processing/src/kube/custom_resources.py b/pypi/data-processing/src/kube/custom_resources.py index 6a25d8a4c..495d5a771 100644 --- a/pypi/data-processing/src/kube/custom_resources.py +++ b/pypi/data-processing/src/kube/custom_resources.py @@ -43,5 +43,4 @@ def 
get_name(self): # CRD LLM arcadia_resource_models = CustomResource(arcadia_group, "llms") # CRD Versioneddataset -arcadia_resource_versioneddatasets = CustomResource( - arcadia_group, "versioneddatasets") +arcadia_resource_versioneddatasets = CustomResource(arcadia_group, "versioneddatasets") diff --git a/pypi/data-processing/src/kube/dataset_cr.py b/pypi/data-processing/src/kube/dataset_cr.py index 54bc7b436..b74eaf0cc 100644 --- a/pypi/data-processing/src/kube/dataset_cr.py +++ b/pypi/data-processing/src/kube/dataset_cr.py @@ -20,14 +20,10 @@ logger = logging.getLogger(__name__) -def update_dataset_k8s_cr( - namespace, - version_data_set_name, - reason, - message -): - """ Update the condition info for the dataset. - + +def update_dataset_k8s_cr(namespace, version_data_set_name, reason, message): + """Update the condition info for the dataset. + namespace: namespace; version_data_set_name: version dataset name; reason: the update reason; @@ -36,68 +32,52 @@ def update_dataset_k8s_cr( kube = client.KubeEnv() one_cr_datasets = kube.get_versioneddatasets_status( - namespace, - version_data_set_name - ) + namespace, version_data_set_name + ) - conditions = one_cr_datasets['status']['conditions'] + conditions = one_cr_datasets["status"]["conditions"] now_utc_str = date_time_utils.now_utc_str() found_index = None for i in range(len(conditions)): item = conditions[i] - if item['type'] == 'DataProcessing': + if item["type"] == "DataProcessing": found_index = i break - result = None if found_index is None: - conditions.append({ - 'lastTransitionTime': now_utc_str, - 'reason': reason, - 'status': "True", - "type": "DataProcessing", - "message": message - }) + conditions.append( + { + "lastTransitionTime": now_utc_str, + "reason": reason, + "status": "True", + "type": "DataProcessing", + "message": message, + } + ) else: conditions[found_index] = { - 'lastTransitionTime': now_utc_str, - 'reason': reason, - 'status': "True", + "lastTransitionTime": now_utc_str, + "reason": reason, + "status": "True", "type": "DataProcessing", - "message": message + "message": message, } kube.patch_versioneddatasets_status( - namespace, - version_data_set_name, - { - 'status': { - 'conditions': conditions - } - } + namespace, version_data_set_name, {"status": {"conditions": conditions}} ) - return { - 'status': 200, - 'message': '更新数据集状态成功', - 'data': '' - } + return {"status": 200, "message": "更新数据集状态成功", "data": ""} except Exception as ex: logger.error(str(ex)) - return { - 'status': 400, - 'message': '更新数据集状态失败', - 'data': '' - } - -def get_dataset_status_k8s_cr( - namespace, - version_data_set_name -): - """ get the condition info for the dataset. - + return {"status": 400, "message": "更新数据集状态失败", "data": ""} + + +def get_dataset_status_k8s_cr(namespace, version_data_set_name): + """get the condition info for the dataset. 
+ namespace: namespace; version_data_set_name: version dataset name; """ @@ -106,33 +86,23 @@ def get_dataset_status_k8s_cr( kube = client.KubeEnv() one_cr_datasets = kube.get_versioneddatasets_status( - namespace, - version_data_set_name - ) + namespace, version_data_set_name + ) - conditions = one_cr_datasets['status']['conditions'] + conditions = one_cr_datasets["status"]["conditions"] found_index = None for i in range(len(conditions)): item = conditions[i] - if item['type'] == 'DataProcessing': + if item["type"] == "DataProcessing": found_index = i break - result = None if found_index: - dataset_status = conditions[found_index].get('reason') + dataset_status = conditions[found_index].get("reason") - return { - 'status': 200, - 'message': '获取数据集状态成功', - 'data': dataset_status - } + return {"status": 200, "message": "获取数据集状态成功", "data": dataset_status} except Exception as ex: logger.error(str(ex)) - return { - 'status': 400, - 'message': '获取数据集状态失败', - 'data': '' - } + return {"status": 400, "message": "获取数据集状态失败", "data": ""} diff --git a/pypi/data-processing/src/kube/minio_cr.py b/pypi/data-processing/src/kube/minio_cr.py index 0cc226202..12a4b2d5c 100644 --- a/pypi/data-processing/src/kube/minio_cr.py +++ b/pypi/data-processing/src/kube/minio_cr.py @@ -23,12 +23,9 @@ logger = logging.getLogger(__name__) -def get_minio_config_in_k8s_configmap( - namespace, - config_map_name -): +def get_minio_config_in_k8s_configmap(namespace, config_map_name): """Get the MinIO config info in the configmap. - + namespace: namespace; config_map_name: config map name """ @@ -36,46 +33,51 @@ def get_minio_config_in_k8s_configmap( kube = client.KubeEnv() config_map = kube.read_namespaced_config_map( - namespace=namespace, - name=config_map_name + namespace=namespace, name=config_map_name ) - config = config_map.data.get('config') - + config = config_map.data.get("config") + json_data = yaml.safe_load(config) - datasource = json_data['systemDatasource'] + datasource = json_data["systemDatasource"] minio_cr_object = kube.get_datasource_object( - namespace=datasource['namespace'], - name=datasource['name'] + namespace=datasource["namespace"], name=datasource["name"] ) - minio_api_url = minio_cr_object['spec']['endpoint']['url'] + minio_api_url = minio_cr_object["spec"]["endpoint"]["url"] minio_secure = True - insecure = minio_cr_object['spec']['endpoint'].get('insecure') + insecure = minio_cr_object["spec"]["endpoint"].get("insecure") if insecure is None: minio_secure = True - elif str(insecure).lower() == 'true': + elif str(insecure).lower() == "true": minio_secure = False - secret_info = kube.get_secret_info( namespace=namespace, - name=minio_cr_object['spec']['endpoint']['authSecret']['name'] - ) + name=minio_cr_object["spec"]["endpoint"]["authSecret"]["name"], + ) return { - 'minio_api_url': minio_api_url, - 'minio_secure': minio_secure, - 'minio_access_key': base64.b64decode(secret_info['rootUser']).decode('utf-8'), - 'minio_secret_key': base64.b64decode(secret_info['rootPassword']).decode('utf-8') + "minio_api_url": minio_api_url, + "minio_secure": minio_secure, + "minio_access_key": base64.b64decode(secret_info["rootUser"]).decode( + "utf-8" + ), + "minio_secret_key": base64.b64decode(secret_info["rootPassword"]).decode( + "utf-8" + ), } except Exception as ex: - logger.error(''.join([ - f"Can not get the MinIO config info. The error is: \n", - f"{traceback.format_exc()}\n" - ])) - - return None \ No newline at end of file + logger.error( + "".join( + [ + f"Can not get the MinIO config info. 
The error is: \n", + f"{traceback.format_exc()}\n", + ] + ) + ) + + return None diff --git a/pypi/data-processing/src/kube/model_cr.py b/pypi/data-processing/src/kube/model_cr.py index 171125d48..2cc986d52 100644 --- a/pypi/data-processing/src/kube/model_cr.py +++ b/pypi/data-processing/src/kube/model_cr.py @@ -16,67 +16,47 @@ import traceback import yaml - from utils import date_time_utils from . import client logger = logging.getLogger(__name__) -def get_spec_for_llms_k8s_cr( - name, - namespace -): - """ get worker model. - + +def get_spec_for_llms_k8s_cr(name, namespace): + """get worker model. + name: model name; namespace: namespace; """ try: kube = client.KubeEnv() - one_cr_llm = kube.get_versionedmodels_status( - namespace=namespace, - name=name - ) + one_cr_llm = kube.get_versionedmodels_status(namespace=namespace, name=name) - provider = one_cr_llm['spec'] + provider = one_cr_llm["spec"] - return { - 'status': 200, - 'message': '获取llms中的provider成功', - 'data': provider - } + return {"status": 200, "message": "获取llms中的provider成功", "data": provider} except Exception as ex: logger.error(str(ex)) - return { - 'status': 400, - 'message': '获取llms中的provider失败', - 'data': '' - } - - -def get_worker_base_url_k8s_configmap( - name, - namespace -): - """ get base url for configmap. - + return {"status": 400, "message": "获取llms中的provider失败", "data": ""} + + +def get_worker_base_url_k8s_configmap(name, namespace): + """get base url for configmap. + name: model name; namespace: namespace; """ try: kube = client.KubeEnv() - config_map = kube.read_namespaced_config_map( - name=name, - namespace=namespace - ) + config_map = kube.read_namespaced_config_map(name=name, namespace=namespace) + + config = config_map.data.get("config") - config = config_map.data.get('config') - json_data = yaml.safe_load(config) - external_api_server = json_data.get('gateway', {}).get('apiServer') + external_api_server = json_data.get("gateway", {}).get("apiServer") return external_api_server except Exception as ex: @@ -84,32 +64,24 @@ def get_worker_base_url_k8s_configmap( return None -def get_secret_info( - name, - namespace -): - """ get secret info by name and namespace. - +def get_secret_info(name, namespace): + """get secret info by name and namespace. + name: model name; namespace: namespace; """ try: kube = client.KubeEnv() - return kube.get_secret_info( - namespace=namespace, - name=name - ) + return kube.get_secret_info(namespace=namespace, name=name) except Exception as ex: logger.error(str(ex)) return None -def get_llm_qa_retry_count_in_k8s_configmap( - namespace, - config_map_name -): + +def get_llm_qa_retry_count_in_k8s_configmap(namespace, config_map_name): """Get the llm QA retry count in the configmap. - + namespace: namespace; config_map_name: config map name """ @@ -117,19 +89,22 @@ def get_llm_qa_retry_count_in_k8s_configmap( kube = client.KubeEnv() config_map = kube.read_namespaced_config_map( - namespace=namespace, - name=config_map_name + namespace=namespace, name=config_map_name ) - config = config_map.data.get('dataprocess') - + config = config_map.data.get("dataprocess") + json_data = yaml.safe_load(config) - return json_data['llm']['qa_retry_count'] + return json_data["llm"]["qa_retry_count"] except Exception as ex: - logger.error(''.join([ - f"Can not the llm QA retry count. The error is: \n", - f"{traceback.format_exc()}\n" - ])) - + logger.error( + "".join( + [ + f"Can not the llm QA retry count. 
The error is: \n", + f"{traceback.format_exc()}\n", + ] + ) + ) + return None diff --git a/pypi/data-processing/src/kube/postgresql_cr.py b/pypi/data-processing/src/kube/postgresql_cr.py index bc2606d75..b22946dd7 100644 --- a/pypi/data-processing/src/kube/postgresql_cr.py +++ b/pypi/data-processing/src/kube/postgresql_cr.py @@ -22,12 +22,9 @@ logger = logging.getLogger(__name__) -def get_postgresql_config_in_k8s_configmap( - namespace, - config_map_name -): +def get_postgresql_config_in_k8s_configmap(namespace, config_map_name): """Get the PostgreSQL config info in the configmap. - + namespace: namespace; config_map_name: config map name """ @@ -35,19 +32,22 @@ def get_postgresql_config_in_k8s_configmap( kube = client.KubeEnv() config_map = kube.read_namespaced_config_map( - namespace=namespace, - name=config_map_name + namespace=namespace, name=config_map_name ) - config = config_map.data.get('dataprocess') - + config = config_map.data.get("dataprocess") + json_data = yaml.safe_load(config) - return json_data['postgresql'] + return json_data["postgresql"] except Exception as ex: - logger.error(''.join([ - f"Can not get the PostgreSQL config info. The error is: \n", - f"{traceback.format_exc()}\n" - ])) - - return None \ No newline at end of file + logger.error( + "".join( + [ + f"Can not get the PostgreSQL config info. The error is: \n", + f"{traceback.format_exc()}\n", + ] + ) + ) + + return None diff --git a/pypi/data-processing/src/llm_api_service/base_qa_provider.py b/pypi/data-processing/src/llm_api_service/base_qa_provider.py index 02450eba4..40a5ffbb6 100644 --- a/pypi/data-processing/src/llm_api_service/base_qa_provider.py +++ b/pypi/data-processing/src/llm_api_service/base_qa_provider.py @@ -13,20 +13,16 @@ # limitations under the License. - from abc import ABC, abstractmethod class BaseQAProvider(ABC): """The Base class for the QA provider.""" + @abstractmethod - def generate_qa_list( - self, - text, - prompt_template=None - ): + def generate_qa_list(self, text, prompt_template=None): """Generate the QA list. 
- + Parameters ---------- text @@ -34,4 +30,3 @@ def generate_qa_list( prompt_template the prompt template """ - diff --git a/pypi/data-processing/src/llm_api_service/qa_provider_open_ai.py b/pypi/data-processing/src/llm_api_service/qa_provider_open_ai.py index cefb1c726..48be37020 100644 --- a/pypi/data-processing/src/llm_api_service/qa_provider_open_ai.py +++ b/pypi/data-processing/src/llm_api_service/qa_provider_open_ai.py @@ -18,50 +18,39 @@ import time import traceback +from common import log_tag_const +from common.config import config from langchain import LLMChain from langchain.chat_models import ChatOpenAI from langchain.prompts.chat import (ChatPromptTemplate, HumanMessagePromptTemplate) - -from common import log_tag_const -from common.config import config from llm_prompt_template import llm_prompt from .base_qa_provider import BaseQAProvider logger = logging.getLogger(__name__) + class QAProviderOpenAI(BaseQAProvider): """The QA provider is used by open ai.""" - - def __init__( - self, - api_key, - base_url, - model, - temperature=None, - max_tokens=None - ): + + def __init__(self, api_key, base_url, model, temperature=None, max_tokens=None): if temperature is None: temperature = "0.8" if max_tokens is None: max_tokens = "512" self.llm = ChatOpenAI( - openai_api_key=api_key, + openai_api_key=api_key, base_url=base_url, model=model, temperature=float(temperature), - max_tokens=int(max_tokens) - ) - - def generate_qa_list( - self, - text, - prompt_template=None - ): + max_tokens=int(max_tokens), + ) + + def generate_qa_list(self, text, prompt_template=None): """Generate the QA list. - + Parameters ---------- text @@ -71,25 +60,26 @@ def generate_qa_list( """ if prompt_template is None: prompt_template = llm_prompt.get_default_prompt_template() - + human_message_prompt = HumanMessagePromptTemplate.from_template(prompt_template) prompt = ChatPromptTemplate.from_messages([human_message_prompt]) - llm_chain = LLMChain( - prompt=prompt, - llm=self.llm - ) + llm_chain = LLMChain(prompt=prompt, llm=self.llm) result = [] status = 200 - message = '' + message = "" invoke_count = 0 while True: try: if invoke_count >= int(config.llm_qa_retry_count): - logger.error(''.join([ - f"{log_tag_const.OPEN_AI} Cannot access the open ai service.\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.OPEN_AI} Cannot access the open ai service.\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) status = 1000 break @@ -99,28 +89,22 @@ def generate_qa_list( if len(result) > 0: break else: - logger.warn('failed to get QA list, wait for 10 seconds and retry') - time.sleep(10) # sleep 10 seconds + logger.warn( + "failed to get QA list, wait for 10 seconds and retry" + ) + time.sleep(10) # sleep 10 seconds invoke_count += 1 - message = '模型调用成功,生成的QA格式不对,请更换prompt' + message = "模型调用成功,生成的QA格式不对,请更换prompt" except Exception as ex: time.sleep(10) invoke_count += 1 - message = '调用本地模型失败,请检查模型是否可用' - - return { - 'status': status, - 'message': message, - 'data': result - } - - - def __get_qa_list_from_response( - self, - response - ): + message = "调用本地模型失败,请检查模型是否可用" + + return {"status": status, "message": message, "data": result} + + def __get_qa_list_from_response(self, response): """Get the QA list from the response. - + Notice: There are some problems in the local OpenAI service. Some time it cannot return the correct question and answer list. 
@@ -131,28 +115,26 @@ def __get_qa_list_from_response( """ result = [] try: - pattern = re.compile(r'Q\d+:(\s*)(.*?)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q|$)') - + pattern = re.compile(r"Q\d+:(\s*)(.*?)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q|$)") # 移除换行符 - response_text = response.replace('\\n', '') + response_text = response.replace("\\n", "") matches = pattern.findall(response_text) for match in matches: q = match[1] a = match[4] if q and a: - a = re.sub(r'[\n]', '', a).strip() + a = re.sub(r"[\n]", "", a).strip() result.append([q, a]) except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.OPEN_AI} 从结果中提取QA失败\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) - - return result - - + logger.error( + "".join( + [ + f"{log_tag_const.OPEN_AI} 从结果中提取QA失败\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) - - + return result diff --git a/pypi/data-processing/src/llm_api_service/qa_provider_zhi_pu_ai_online.py b/pypi/data-processing/src/llm_api_service/qa_provider_zhi_pu_ai_online.py index 8ca966059..6ea33ee7a 100644 --- a/pypi/data-processing/src/llm_api_service/qa_provider_zhi_pu_ai_online.py +++ b/pypi/data-processing/src/llm_api_service/qa_provider_zhi_pu_ai_online.py @@ -19,7 +19,6 @@ import traceback import zhipuai - from common import const, log_tag_const from common.config import config from llm_prompt_template import llm_prompt @@ -35,17 +34,11 @@ class QAProviderZhiPuAIOnline(BaseQAProvider): def __init__(self, api_key=None): zhipuai.api_key = api_key - def generate_qa_list( - self, - text, - model, - prompt_template=None, - top_p=None, - temperature=None + self, text, model, prompt_template=None, top_p=None, temperature=None ): """Generate the QA list. - + Parameters ---------- text @@ -60,27 +53,28 @@ def generate_qa_list( if temperature is None: temperature = "0.8" - content = prompt_template.format( - text=text - ) - + content = prompt_template.format(text=text) + result = [] status = 200 - message = '' + message = "" invoke_count = 0 - wait_seconds = const.llm_wait_seconds + wait_seconds = const.LLM_WAIT_SECONDS while True: - logger.debug(''.join([ - f"{log_tag_const.ZHI_PU_AI} content.\n", - f"{content}\n" - ])) + logger.debug( + "".join([f"{log_tag_const.ZHI_PU_AI} content.\n", f"{content}\n"]) + ) try: if invoke_count >= int(config.llm_qa_retry_count): - logger.error(''.join([ - f"{log_tag_const.ZHI_PU_AI} Cannot access the open ai service.\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.ZHI_PU_AI} Cannot access the open ai service.\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) status = 1000 break @@ -91,45 +85,50 @@ def generate_qa_list( top_p=float(top_p), temperature=float(temperature), ) - if response['success']: + if response["success"]: result = self.__format_response_to_qa_list(response) if len(result) > 0: break else: - logger.warn(f"failed to get QA list, wait for {wait_seconds} seconds and retry") - time.sleep(wait_seconds) # sleep 120 seconds + logger.warn( + f"failed to get QA list, wait for {wait_seconds} seconds and retry" + ) + time.sleep(wait_seconds) # sleep 120 seconds invoke_count += 1 - message = '模型调用成功,生成的QA格式不对,请更换prompt' + message = "模型调用成功,生成的QA格式不对,请更换prompt" else: - logger.error(''.join([ - f"{log_tag_const.ZHI_PU_AI} Cannot access the ZhiPuAI service.\n", - f"The error is: \n{response['msg']}\n" - ])) - logger.warn(f"zhipuai request failed, wait for {wait_seconds} seconds and retry") - time.sleep(wait_seconds) # sleep 120 
seconds + logger.error( + "".join( + [ + f"{log_tag_const.ZHI_PU_AI} Cannot access the ZhiPuAI service.\n", + f"The error is: \n{response['msg']}\n", + ] + ) + ) + logger.warn( + f"zhipuai request failed, wait for {wait_seconds} seconds and retry" + ) + time.sleep(wait_seconds) # sleep 120 seconds invoke_count += 1 - message = '模型调用失败,失败原因: ' + response['msg'] + message = "模型调用失败,失败原因: " + response["msg"] except Exception as ex: - logger.warn(f"zhipuai request exception, wait for {wait_seconds} seconds and retry") + logger.warn( + f"zhipuai request exception, wait for {wait_seconds} seconds and retry" + ) time.sleep(wait_seconds) invoke_count += 1 - message = '模型调用失败,请检查模型是否可用!' - - return { - 'status': status, - 'message': message, - 'data': result - } + message = "模型调用失败,请检查模型是否可用!" + return {"status": status, "message": message, "data": result} def __format_response_to_qa_list(self, response): """Format the response to the QA list.""" - text = response['data']['choices'][0]['content'] + text = response["data"]["choices"][0]["content"] result = [] try: - pattern = re.compile(r'Q\d+:(\s*)(.*?)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q|$)') + pattern = re.compile(r"Q\d+:(\s*)(.*?)(\s*)A\d+:(\s*)([\s\S]*?)(?=Q|$)") # 移除换行符 - text = text.replace('\\n', '') + text = text.replace("\\n", "") matches = pattern.findall(text) for match in matches: @@ -138,10 +137,13 @@ def __format_response_to_qa_list(self, response): if q and a: result.append([q, a]) except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.ZHI_PU_AI} 从结果中提取QA失败\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.ZHI_PU_AI} 从结果中提取QA失败\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) return result - \ No newline at end of file diff --git a/pypi/data-processing/src/llm_prompt_template/llm_prompt.py b/pypi/data-processing/src/llm_prompt_template/llm_prompt.py index d5a383672..35b3942e1 100644 --- a/pypi/data-processing/src/llm_prompt_template/llm_prompt.py +++ b/pypi/data-processing/src/llm_prompt_template/llm_prompt.py @@ -19,5 +19,5 @@ def get_default_prompt_template(): 请将上述内容按照问答的方式,提出不超过 25 个问题,并给出每个问题的答案,每个问题必须有 Q 和对应的 A,并严格按照以下方式展示: Q1: 问题。\n A1: 答案。\n Q2: 问题 \n A2: 答案\n 注意,尽可能多的提出问题,但是 Q 不要重复,也不要出现只有 Q 没有 A 的情况。 """ - - return prompt_template \ No newline at end of file + + return prompt_template diff --git a/pypi/data-processing/src/parallel/thread_parallel.py b/pypi/data-processing/src/parallel/thread_parallel.py index 421984140..829d4e3df 100644 --- a/pypi/data-processing/src/parallel/thread_parallel.py +++ b/pypi/data-processing/src/parallel/thread_parallel.py @@ -23,25 +23,21 @@ def run_async_background_task(task_creator, task_name): """Run a async background task with a new thread. 
- + task_creator: a function to run the background task; task_name: the task name which is use to identify the different task; """ loop = asyncio.new_event_loop() - thread = threading.Thread( - target=task_creator, - args=(loop, ), - name=task_name - ) - thread.start() - - logger.debug(''.join([ - f"{log_tag_const.THREADING} Start a new thread.\n", - f"thread name: {task_name}\n", - f"thread id: {thread.ident}" - ])) - - - - + thread = threading.Thread(target=task_creator, args=(loop,), name=task_name) + thread.start() + + logger.debug( + "".join( + [ + f"{log_tag_const.THREADING} Start a new thread.\n", + f"thread name: {task_name}\n", + f"thread id: {thread.ident}", + ] + ) + ) diff --git a/pypi/data-processing/src/server.py b/pypi/data-processing/src/server.py index 0b168d84b..963dd3753 100644 --- a/pypi/data-processing/src/server.py +++ b/pypi/data-processing/src/server.py @@ -13,63 +13,63 @@ # limitations under the License. -import asyncio import logging import time import psycopg2 -from sanic import Sanic -from sanic.response import json -from sanic_cors import CORS - from common import log_tag_const from common.config import config from controller import data_process_controller from database_clients import postgresql_pool_client +from sanic import Sanic +from sanic_cors import CORS from utils import log_utils, sanic_utils # Initialize the log config -log_utils.init_config( - source_type='manipulate_server', - log_dir="log" -) +log_utils.init_config(source_type="manipulate_server", log_dir="log") -logger = logging.getLogger('manipulate_server') +logger = logging.getLogger("manipulate_server") -app = Sanic(name='data_manipulate') +app = Sanic(name="data_manipulate") CORS(app) app.error_handler = sanic_utils.CustomErrorHandler() -@app.middleware('request') +@app.middleware("request") async def request_start_time_middleware(request): """Middleware to record request start time and status code""" request.ctx.start_time = time.time() -@app.middleware('response') +@app.middleware("response") async def request_processing_time_middleware(request, response): """Middleware to calculate and log request processing time and status code""" processing_time = time.time() - request.ctx.start_time - logger.debug(''.join([ - f"{log_tag_const.WEB_SERVER_ACCESS} {request.method.lower()} {request.url} " - f"{response.status} {processing_time:.4f} seconds" - ])) + logger.debug( + "".join( + [ + f"{log_tag_const.WEB_SERVER_ACCESS} {request.method.lower()} {request.url} " + f"{response.status} {processing_time:.4f} seconds" + ] + ) + ) return response -@app.listener('before_server_start') -async def init_web_server(app, loop): - app.config['REQUEST_MAX_SIZE'] = 1024 * 1024 * 1024 # 1G - app.config['REQUEST_TIMEOUT'] = 60 * 60 * 60 - app.config['RESPONSE_TIMEOUT'] = 60 * 60 * 60 - app.config['KEEP_ALIVE_TIMEOUT'] = 60 * 60 * 60 - app.config['conn_pool'] = postgresql_pool_client.get_pool(_create_database_connection) +@app.listener("before_server_start") +async def init_web_server(app, _): + app.config["REQUEST_MAX_SIZE"] = 1024 * 1024 * 1024 # 1G + app.config["REQUEST_TIMEOUT"] = 60 * 60 * 60 + app.config["RESPONSE_TIMEOUT"] = 60 * 60 * 60 + app.config["KEEP_ALIVE_TIMEOUT"] = 60 * 60 * 60 + app.config["conn_pool"] = postgresql_pool_client.get_pool( + _create_database_connection + ) -@app.listener('after_server_stop') -async def shutdown_web_server(app, loop): - postgresql_pool_client.release_pool(app.config['conn_pool']) +@app.listener("after_server_stop") +async def shutdown_web_server(app, _): + 
postgresql_pool_client.release_pool(app.config["conn_pool"]) app.blueprint(data_process_controller.data_process) @@ -78,19 +78,13 @@ async def shutdown_web_server(app, loop): def _create_database_connection(): """Create a database connection.""" return psycopg2.connect( - host=config.pg_host, - port=config.pg_port, - user=config.pg_user, - password=config.pg_password, - database=config.pg_database - ) - - + host=config.pg_host, + port=config.pg_port, + user=config.pg_user, + password=config.pg_password, + database=config.pg_database, + ) -if __name__ == '__main__': - app.run(host='0.0.0.0', - port=28888, - access_log=False, - debug=False, - workers=2) +if __name__ == "__main__": + app.run(host="0.0.0.0", port=28888, access_log=False, debug=False, workers=2) diff --git a/pypi/data-processing/src/service/data_process_service.py b/pypi/data-processing/src/service/data_process_service.py index 6939ed8b1..e1b2c3c06 100644 --- a/pypi/data-processing/src/service/data_process_service.py +++ b/pypi/data-processing/src/service/data_process_service.py @@ -18,7 +18,6 @@ import traceback import ulid - from common import log_tag_const from data_store_process import minio_store_process from database_operate import (data_process_db_operate, @@ -35,28 +34,19 @@ logger = logging.getLogger(__name__) -def list_by_page( - req_json, - pool -): +def list_by_page(req_json, pool): """Get the list data for data processing by page""" return data_process_db_operate.list_by_page(req_json, pool=pool) -def list_by_count( - req_json, - pool -): +def list_by_count(req_json, pool): """Get count for the list data processing with page""" return data_process_db_operate.list_by_count(req_json, pool=pool) -def add( - req_json, - pool -): +def add(req_json, pool): """Add a new data process task. - + req_json is a dictionary object. for example: { "name": "小T_test_0201", @@ -73,51 +63,43 @@ def add( } ], "data_process_config_info": [] - } + } pool: database connection pool. 
""" id = ulid.ulid() - res = data_process_db_operate.add( - req_json, - pool=pool, - id=id - ) + res = data_process_db_operate.add(req_json, pool=pool, id=id) - if res['status'] == 200: + if res["status"] == 200: try: - async def async_text_manipulate( - req_json, - pool, - id - ): + async def async_text_manipulate(req_json, pool, id): minio_store_process.text_manipulate(req_json, pool=pool, id=id) def execute_text_manipulate_task(loop): asyncio.set_event_loop(loop) - loop.run_until_complete(async_text_manipulate(req_json, pool=pool, id=id)) + loop.run_until_complete( + async_text_manipulate(req_json, pool=pool, id=id) + ) thread_parallel.run_async_background_task( - execute_text_manipulate_task, - 'execute text manipuate task' + execute_text_manipulate_task, "execute text manipuate task" ) except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} There is an error when ", - f"start to run the minio store process.\n", - f"{traceback.format_exc()}\n" - ])) - - - + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} There is an error when ", + f"start to run the minio store process.\n", + f"{traceback.format_exc()}\n", + ] + ) + ) + return res -def delete_by_id( - req_json, - pool -): +def delete_by_id(req_json, pool): """Delete a record with id""" # 删除需要在详情中预览的信息 data_process_detail_db_operate.delete_transform_by_task_id(req_json, pool=pool) @@ -136,10 +118,7 @@ def delete_by_id( return data_process_db_operate.delete_by_id(req_json, pool=pool) -def info_by_id( - req_json, - pool -): +def info_by_id(req_json, pool): """Get a detail info with id. req_json is a dictionary object. for example: @@ -147,142 +126,85 @@ def info_by_id( "id": "01HGWBE48DT3ADE9ZKA62SW4WS" } """ - id = req_json['id'] + id = req_json["id"] data = _get_default_data_for_detail() - _get_and_set_basic_detail_info( - data, - task_id=id, - conn_pool=pool - ) + _get_and_set_basic_detail_info(data, task_id=id, conn_pool=pool) - if data['id'] == '': - return { - 'status': 200, - 'message': '', - 'data': data - } + if data["id"] == "": + return {"status": 200, "message": "", "data": data} - process_cofig_map = _convert_config_info_to_map(data.get('data_process_config_info')) + process_cofig_map = _convert_config_info_to_map( + data.get("data_process_config_info") + ) config_map_for_result = {} _set_basic_info_for_config_map_for_result( - config_map_for_result, - process_cofig_map, - task_id=id, - conn_pool=pool + config_map_for_result, process_cofig_map, task_id=id, conn_pool=pool ) _set_children_info_for_config_map_for_result( - config_map_for_result, - process_cofig_map, - task_id=id, - conn_pool=pool + config_map_for_result, process_cofig_map, task_id=id, conn_pool=pool ) # convert the config resule from map to list config_list_for_result = [] for value in config_map_for_result.values(): config_list_for_result.append(value) - - data['config'] = config_list_for_result + + data["config"] = config_list_for_result logger.debug(f"{log_tag_const.DATA_PROCESS_DETAIL} The response data is: \n{data}") - return { - 'status': 200, - 'message': '', - 'data': data - } + return {"status": 200, "message": "", "data": data} -def check_task_name( - req_json, - pool -): + +def check_task_name(req_json, pool): # 判断名称是否已存在 - count = data_process_db_operate.count_by_name( - req_json, - pool=pool - ) + count = data_process_db_operate.count_by_name(req_json, pool=pool) - if count.get('data') > 0: - return { - 'status': 1000, - 'message': '任务名称已存在,请重新输入!', - 'data': '' - } - - return { - 'status': 200, 
- 'message': '', - 'data': '' - } + if count.get("data") > 0: + return {"status": 1000, "message": "任务名称已存在,请重新输入!", "data": ""} + return {"status": 200, "message": "", "data": ""} -def get_log_info( - req_json, - pool -): + +def get_log_info(req_json, pool): # 获取任务日志信息 - log_list = data_process_stage_log_db_operate.list_by_task_id( - req_json, - pool=pool - ) + log_list = data_process_stage_log_db_operate.list_by_task_id(req_json, pool=pool) log_dict = [] - for log_info in log_list.get('data'): - log_dict.append(log_info.get('stage_detail')) + for log_info in log_list.get("data"): + log_dict.append(log_info.get("stage_detail")) - separator = '=' * 100 - log_detail = ('\n' + separator + '\n').join(log_dict) - - return { - 'status': 200, - 'message': '', - 'data': log_detail - } + separator = "=" * 100 + log_detail = ("\n" + separator + "\n").join(log_dict) + return {"status": 200, "message": "", "data": log_detail} -def get_log_by_file_name( - req_json, - pool -): + +def get_log_by_file_name(req_json, pool): try: stage_log_info = data_process_stage_log_db_operate.info_by_stage_and_file_name( - req_json, - pool=pool + req_json, pool=pool ) - if stage_log_info.get('status') != 200: + if stage_log_info.get("status") != 200: return stage_log_info - stage_detail = stage_log_info.get('data')[0].get('stage_detail') - - return { - 'status': 200, - 'message': '', - 'data': stage_detail - } + stage_detail = stage_log_info.get("data")[0].get("stage_detail") + + return {"status": 200, "message": "", "data": stage_detail} except Exception as ex: - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} -def retry( - req_json, - pool -): +def retry(req_json, pool): """When a task fails, attempt a retry.""" try: logger.debug(f"{log_tag_const.DATA_PROCESS_SERVICE} The task retry start") - async def async_text_manipulate_retry( - req_json, - pool - ): + async def async_text_manipulate_retry(req_json, pool): minio_store_process.text_manipulate_retry(req_json, pool=pool) def execute_text_manipulate_task_retry(loop): @@ -290,28 +212,19 @@ def execute_text_manipulate_task_retry(loop): loop.run_until_complete(async_text_manipulate_retry(req_json, pool=pool)) thread_parallel.run_async_background_task( - execute_text_manipulate_task_retry, - 'execute text manipuate task retry' + execute_text_manipulate_task_retry, "execute text manipuate task retry" ) - return { - 'status': 200, - 'message': '任务开始重试!', - 'data': '' - } + return {"status": 200, "message": "任务开始重试!", "data": ""} except Exception as ex: - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} def _get_default_data_for_detail(): """Get the data for the detail""" return { "id": "", - "name": "", + "name": "", "status": "", "file_type": "", "pre_dataset_name": "", @@ -319,21 +232,17 @@ def _get_default_data_for_detail(): "post_dataset_name": "", "post_dataset_version": "", "file_num": 0, - "start_time": '', - "end_time": '', - "create_user": '', + "start_time": "", + "end_time": "", + "create_user": "", "data_process_config_info": [], - "config": [] + "config": [], } -def _get_and_set_basic_detail_info( - from_result, - task_id, - conn_pool -): +def _get_and_set_basic_detail_info(from_result, task_id, conn_pool): """Get and set the basic detail info. 
- + from_result: the from result, it's content will be changed; task_id: task id; @@ -341,293 +250,290 @@ def _get_and_set_basic_detail_info( """ # step 1 # Get the detail info from the database. - detail_info_params = { - 'id': task_id - } + detail_info_params = {"id": task_id} detail_info_res = data_process_db_operate.info_by_id( - detail_info_params, - pool=conn_pool + detail_info_params, pool=conn_pool ) - if detail_info_res['status'] == 200 and len(detail_info_res['data']) > 0: - detail_info_data = detail_info_res['data'][0] + if detail_info_res["status"] == 200 and len(detail_info_res["data"]) > 0: + detail_info_data = detail_info_res["data"][0] file_num = 0 - if detail_info_data.get('file_names'): - file_num = len(detail_info_data['file_names']) - - from_result['id'] = task_id - from_result['name'] = detail_info_data['name'] - from_result['status'] = detail_info_data['status'] - from_result['file_type'] = detail_info_data['file_type'] - from_result['file_num'] = file_num - from_result['pre_dataset_name'] = detail_info_data['pre_data_set_name'] - from_result['pre_dataset_version'] = detail_info_data['pre_data_set_version'] - from_result['post_dataset_name'] = detail_info_data['post_data_set_name'] - from_result['post_dataset_version'] = detail_info_data['post_data_set_version'] - from_result['start_time'] = detail_info_data['start_datetime'] - from_result['end_time'] = detail_info_data['end_datetime'] - from_result['creator'] = detail_info_data['create_user'] - from_result['error_msg'] = detail_info_data['error_msg'] - from_result['data_process_config_info'] = detail_info_data['data_process_config_info'] + if detail_info_data.get("file_names"): + file_num = len(detail_info_data["file_names"]) + + from_result["id"] = task_id + from_result["name"] = detail_info_data["name"] + from_result["status"] = detail_info_data["status"] + from_result["file_type"] = detail_info_data["file_type"] + from_result["file_num"] = file_num + from_result["pre_dataset_name"] = detail_info_data["pre_data_set_name"] + from_result["pre_dataset_version"] = detail_info_data["pre_data_set_version"] + from_result["post_dataset_name"] = detail_info_data["post_data_set_name"] + from_result["post_dataset_version"] = detail_info_data["post_data_set_version"] + from_result["start_time"] = detail_info_data["start_datetime"] + from_result["end_time"] = detail_info_data["end_datetime"] + from_result["creator"] = detail_info_data["create_user"] + from_result["error_msg"] = detail_info_data["error_msg"] + from_result["data_process_config_info"] = detail_info_data[ + "data_process_config_info" + ] else: - from_result['id'] = '' + from_result["id"] = "" def _convert_config_info_to_map(config_info_list): """Convert the config info to map. - config_info_list: a list for example - [ - { - "type": "qa_split" - }, - { - "type": "remove_invisible_characters" - }, - { - "type": "space_standardization" - }, - { - "type": "remove_email" - } - ] + config_info_list: a list for example + [ + { + "type": "qa_split" + }, + { + "type": "remove_invisible_characters" + }, + { + "type": "space_standardization" + }, + { + "type": "remove_email" + } + ] """ result = {} for item in config_info_list: - result[item['type']] = item + result[item["type"]] = item return result def _set_basic_info_for_config_map_for_result( - from_result, - process_cofig_map, - task_id, - conn_pool + from_result, process_cofig_map, task_id, conn_pool ): """Set basic info for the config map for result. - + from_result: the from result, it's content will be changed. 
- process_config_map: process config map + process_config_map: process config map """ # chunk processing - if process_cofig_map.get('qa_split'): - if from_result.get('chunk_processing') is None: - from_result['chunk_processing'] = { - 'name': 'chunk_processing', - 'description': '拆分处理', - 'file_num': _get_qa_process_file_num( - task_id=task_id, - conn_pool=conn_pool + if process_cofig_map.get("qa_split"): + if from_result.get("chunk_processing") is None: + from_result["chunk_processing"] = { + "name": "chunk_processing", + "description": "拆分处理", + "file_num": _get_qa_process_file_num( + task_id=task_id, conn_pool=conn_pool ), - 'status': _get_qa_split_status( - task_id=task_id, - conn_pool=conn_pool - ), - 'children': [] + "status": _get_qa_split_status(task_id=task_id, conn_pool=conn_pool), + "children": [], } # data clean - if process_cofig_map.get('remove_invisible_characters') or \ - process_cofig_map.get('space_standardization') or \ - process_cofig_map.get('remove_garbled_text') or \ - process_cofig_map.get('traditional_to_simplified') or \ - process_cofig_map.get('remove_html_tag') or \ - process_cofig_map.get('remove_emojis'): - if from_result.get('clean') is None: - from_result['clean'] = { - 'name': 'clean', - 'description': '异常清洗配置', - 'file_num': _get_clean_process_file_num( - task_id=task_id, - conn_pool=conn_pool + if ( + process_cofig_map.get("remove_invisible_characters") + or process_cofig_map.get("space_standardization") + or process_cofig_map.get("remove_garbled_text") + or process_cofig_map.get("traditional_to_simplified") + or process_cofig_map.get("remove_html_tag") + or process_cofig_map.get("remove_emojis") + ): + if from_result.get("clean") is None: + from_result["clean"] = { + "name": "clean", + "description": "异常清洗配置", + "file_num": _get_clean_process_file_num( + task_id=task_id, conn_pool=conn_pool ), - 'status': 'success', - 'children': [] + "status": "success", + "children": [], } - + # remove privacy - if process_cofig_map.get('remove_email') or \ - process_cofig_map.get('remove_ip_address') or \ - process_cofig_map.get('remove_number'): - if from_result.get('privacy') is None: - from_result['privacy'] = { - 'name': 'privacy', - 'description': '数据隐私处理', - 'file_num': _get_privacy_process_file_num( - task_id=task_id, - conn_pool=conn_pool + if ( + process_cofig_map.get("remove_email") + or process_cofig_map.get("remove_ip_address") + or process_cofig_map.get("remove_number") + ): + if from_result.get("privacy") is None: + from_result["privacy"] = { + "name": "privacy", + "description": "数据隐私处理", + "file_num": _get_privacy_process_file_num( + task_id=task_id, conn_pool=conn_pool ), - 'status': 'success', - 'children': [] + "status": "success", + "children": [], } def _set_children_info_for_config_map_for_result( - from_result, - process_cofig_map, - task_id, - conn_pool + from_result, process_cofig_map, task_id, conn_pool ): """Set child list for the config for result - + from_result: the from result, it's content will be changed. 
process_config_map: process config map; task_id: task id, - conn_pool: database connection pool + conn_pool: database connection pool """ # insert the qa list - if process_cofig_map.get('qa_split'): - from_result['chunk_processing']['children'].append({ - 'name': 'qa_split', - 'enable': 'true', - 'zh_name': 'QA拆分', - 'description': '根据文件中的文档内容,自动将文件做 QA 拆分处理。', - 'llm_config': process_cofig_map.get('qa_split').get('llm_config'), - 'preview': _get_qa_list_preview( - task_id=task_id, - conn_pool=conn_pool - ), - 'file_progress': _get_file_progress( - task_id=task_id, - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("qa_split"): + from_result["chunk_processing"]["children"].append( + { + "name": "qa_split", + "enable": "true", + "zh_name": "QA拆分", + "description": "根据文件中的文档内容,自动将文件做 QA 拆分处理。", + "llm_config": process_cofig_map.get("qa_split").get("llm_config"), + "preview": _get_qa_list_preview(task_id=task_id, conn_pool=conn_pool), + "file_progress": _get_file_progress( + task_id=task_id, conn_pool=conn_pool + ), + } + ) # remove invisible characters - if process_cofig_map.get('remove_invisible_characters'): - from_result['clean']['children'].append({ - 'name': 'remove_invisible_characters', - 'enable': 'true', - 'zh_name': '移除不可见字符', - 'description': '移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='remove_invisible_characters', - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("remove_invisible_characters"): + from_result["clean"]["children"].append( + { + "name": "remove_invisible_characters", + "enable": "true", + "zh_name": "移除不可见字符", + "description": "移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围", + "preview": _get_transform_preview_list( + task_id=task_id, + transform_type="remove_invisible_characters", + conn_pool=conn_pool, + ), + } + ) # space standardization - if process_cofig_map.get('space_standardization'): - from_result['clean']['children'].append({ - 'name': 'space_standardization', - 'enable': 'true', - 'zh_name': '空格处理', - 'description': '将不同的unicode空格比如u2008, 转成正常的空格', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='space_standardization', - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("space_standardization"): + from_result["clean"]["children"].append( + { + "name": "space_standardization", + "enable": "true", + "zh_name": "空格处理", + "description": "将不同的unicode空格比如u2008, 转成正常的空格", + "preview": _get_transform_preview_list( + task_id=task_id, + transform_type="space_standardization", + conn_pool=conn_pool, + ), + } + ) # remove garbled text - if process_cofig_map.get('remove_garbled_text'): - from_result['clean']['children'].append({ - 'name': 'remove_garbled_text', - 'enable': 'true', - 'zh_name': '去除乱码', - 'description': '去除乱码和无意义的unicode', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='remove_garbled_text', - conn_pool=conn_pool - ) - }) - + if process_cofig_map.get("remove_garbled_text"): + from_result["clean"]["children"].append( + { + "name": "remove_garbled_text", + "enable": "true", + "zh_name": "去除乱码", + "description": "去除乱码和无意义的unicode", + "preview": _get_transform_preview_list( + task_id=task_id, + transform_type="remove_garbled_text", + conn_pool=conn_pool, + ), + } + ) + # traditional to simplified - if process_cofig_map.get('traditional_to_simplified'): - from_result['clean']['children'].append({ - 'name': 'traditional_to_simplified', - 'enable': 'true', - 'zh_name': '繁转简', - 'description': 
'繁体转简体,如“不經意,妳的笑容”清洗成“不经意,你的笑容”', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='traditional_to_simplified', - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("traditional_to_simplified"): + from_result["clean"]["children"].append( + { + "name": "traditional_to_simplified", + "enable": "true", + "zh_name": "繁转简", + "description": "繁体转简体,如“不經意,妳的笑容”清洗成“不经意,你的笑容”", + "preview": _get_transform_preview_list( + task_id=task_id, + transform_type="traditional_to_simplified", + conn_pool=conn_pool, + ), + } + ) # remove html tag - if process_cofig_map.get('remove_html_tag'): - from_result['clean']['children'].append({ - 'name': 'remove_html_tag', - 'enable': 'true', - 'zh_name': '去除网页标识符', - 'description': '移除文档中的html标签, 如,,
等', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='remove_html_tag', - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("remove_html_tag"): + from_result["clean"]["children"].append( + { + "name": "remove_html_tag", + "enable": "true", + "zh_name": "去除网页标识符", + "description": "移除文档中的html标签, 如,,
等", + "preview": _get_transform_preview_list( + task_id=task_id, + transform_type="remove_html_tag", + conn_pool=conn_pool, + ), + } + ) # remove emojis - if process_cofig_map.get('remove_emojis'): - from_result['clean']['children'].append({ - 'name': 'remove_emojis', - 'enable': 'true', - 'zh_name': '去除表情', - 'description': '去除文档中的表情,如‘🐰’, ‘🧑🏼’等', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='remove_emojis', - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("remove_emojis"): + from_result["clean"]["children"].append( + { + "name": "remove_emojis", + "enable": "true", + "zh_name": "去除表情", + "description": "去除文档中的表情,如‘🐰’, ‘🧑🏼’等", + "preview": _get_transform_preview_list( + task_id=task_id, transform_type="remove_emojis", conn_pool=conn_pool + ), + } + ) # remove email - if process_cofig_map.get('remove_email'): - from_result['privacy']['children'].append({ - 'name': 'remove_email', - 'enable': 'true', - 'zh_name': '去除Email', - 'description': '去除email地址', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='remove_email', - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("remove_email"): + from_result["privacy"]["children"].append( + { + "name": "remove_email", + "enable": "true", + "zh_name": "去除Email", + "description": "去除email地址", + "preview": _get_transform_preview_list( + task_id=task_id, transform_type="remove_email", conn_pool=conn_pool + ), + } + ) # remove ip address - if process_cofig_map.get('remove_ip_address'): - from_result['privacy']['children'].append({ - 'name': 'remove_ip_address', - 'enable': 'true', - 'zh_name': '去除IP地址', - 'description': '去除IPv4 或者 IPv6 地址', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='remove_ip_address', - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("remove_ip_address"): + from_result["privacy"]["children"].append( + { + "name": "remove_ip_address", + "enable": "true", + "zh_name": "去除IP地址", + "description": "去除IPv4 或者 IPv6 地址", + "preview": _get_transform_preview_list( + task_id=task_id, + transform_type="remove_ip_address", + conn_pool=conn_pool, + ), + } + ) # remove number - if process_cofig_map.get('remove_number'): - from_result['privacy']['children'].append({ - 'name': 'remove_number', - 'enable': 'true', - 'zh_name': '去除数字', - 'description': '去除数字和字母数字标识符,如电话号码、信用卡号、十六进制散列等,同时跳过年份和简单数字的实例', - 'preview': _get_transform_preview_list( - task_id=task_id, - transform_type='remove_number', - conn_pool=conn_pool - ) - }) + if process_cofig_map.get("remove_number"): + from_result["privacy"]["children"].append( + { + "name": "remove_number", + "enable": "true", + "zh_name": "去除数字", + "description": "去除数字和字母数字标识符,如电话号码、信用卡号、十六进制散列等,同时跳过年份和简单数字的实例", + "preview": _get_transform_preview_list( + task_id=task_id, transform_type="remove_number", conn_pool=conn_pool + ), + } + ) -def _get_transform_preview_list( - task_id, - transform_type, - conn_pool -): - """"Get transofm preview list. - +def _get_transform_preview_list(task_id, transform_type, conn_pool): + """ "Get transofm preview list. 
+ task_id: task id; transform_type: transform type conn_pool: database connection pool; @@ -635,212 +541,177 @@ def _get_transform_preview_list( transform_preview = [] # step 1 # list file name in transform - list_file_name_params = { - 'task_id': task_id, - 'transform_type': transform_type - } + list_file_name_params = {"task_id": task_id, "transform_type": transform_type} list_file_name_res = data_process_detail_db_operate.list_file_name_for_transform( - list_file_name_params, - pool=conn_pool + list_file_name_params, pool=conn_pool ) - if list_file_name_res['status'] == 200: - for item in list_file_name_res['data']: - transform_preview.append({ - 'file_name': item['file_name'], - 'content': [] - }) + if list_file_name_res["status"] == 200: + for item in list_file_name_res["data"]: + transform_preview.append({"file_name": item["file_name"], "content": []}) # step 2 # iterate the transform preview for item in transform_preview: list_transform_params = { - 'task_id': task_id, - 'transform_type': transform_type, - 'file_name': item['file_name'] + "task_id": task_id, + "transform_type": transform_type, + "file_name": item["file_name"], } - list_transform_res = data_process_detail_db_operate.top_n_list_transform_for_preview( - list_transform_params, - pool=conn_pool + list_transform_res = ( + data_process_detail_db_operate.top_n_list_transform_for_preview( + list_transform_params, pool=conn_pool + ) ) - if list_transform_res['status'] == 200: - for item_transform in list_transform_res['data']: - item['content'].append({ - 'pre': item_transform['pre_content'], - 'post': item_transform['post_content'] - }) - + if list_transform_res["status"] == 200: + for item_transform in list_transform_res["data"]: + item["content"].append( + { + "pre": item_transform["pre_content"], + "post": item_transform["post_content"], + } + ) + return transform_preview -def _get_qa_list_preview( - task_id, - conn_pool -): +def _get_qa_list_preview(task_id, conn_pool): """Get the QA list preview. 
- + task_id: task od; conn_pool: database connection pool """ - logger.debug(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} Get preview for QA " - ])) + logger.debug("".join([f"{log_tag_const.MINIO_STORE_PROCESS} Get preview for QA "])) qa_list_preview = [] # step 1 # list file name in QA - list_file_name_params = { - 'task_id': task_id, - 'transform_type': 'qa_split' - } - list_file_name_res = data_process_detail_preview_db_operate.list_file_name_by_task_id( - list_file_name_params, - pool=conn_pool + list_file_name_params = {"task_id": task_id, "transform_type": "qa_split"} + list_file_name_res = ( + data_process_detail_preview_db_operate.list_file_name_by_task_id( + list_file_name_params, pool=conn_pool + ) ) - if list_file_name_res['status'] == 200: - for item in list_file_name_res['data']: - qa_list_preview.append({ - 'file_name': item['file_name'], - 'content': [] - }) - + if list_file_name_res["status"] == 200: + for item in list_file_name_res["data"]: + qa_list_preview.append({"file_name": item["file_name"], "content": []}) + # step 2 # iterate the QA list preview - list_qa_params = { - 'task_id': task_id, - 'transform_type': 'qa_split' - } + list_qa_params = {"task_id": task_id, "transform_type": "qa_split"} list_qa_res = data_process_detail_preview_db_operate.list_for_preview( - list_qa_params, - pool=conn_pool + list_qa_params, pool=conn_pool ) for item in qa_list_preview: - for item_qa in list_qa_res['data']: - if item.get('file_name') == item_qa.get('file_name'): - item['content'].append({ - 'pre': item_qa['pre_content'], - 'post': item_qa['post_content'] - }) - + for item_qa in list_qa_res["data"]: + if item.get("file_name") == item_qa.get("file_name"): + item["content"].append( + {"pre": item_qa["pre_content"], "post": item_qa["post_content"]} + ) + return qa_list_preview -def _get_file_progress( - task_id, - conn_pool -): + +def _get_file_progress(task_id, conn_pool): """Get file progress. - + task_id: task id; conn_pool: database connection pool """ # Get the detail info from the database. - detail_info_params = { - 'task_id': task_id - } + detail_info_params = {"task_id": task_id} list_file = data_process_document_db_operate.list_file_by_task_id( - detail_info_params, - pool=conn_pool + detail_info_params, pool=conn_pool ) - return list_file.get('data') + return list_file.get("data") -def _get_qa_split_status( - task_id, - conn_pool -): + +def _get_qa_split_status(task_id, conn_pool): """Get file progress. - + task_id: task id; conn_pool: database connection pool """ # Get the detail info from the database. 
- status = 'doing' - detail_info_params = { - 'task_id': task_id - } + status = "doing" + detail_info_params = {"task_id": task_id} list_file = data_process_document_db_operate.list_file_by_task_id( - detail_info_params, - pool=conn_pool + detail_info_params, pool=conn_pool ) - if list_file.get('status') != 200 or len(list_file.get('data')) == 0: - return 'fail' - - file_dict = list_file.get('data') + if list_file.get("status") != 200 or len(list_file.get("data")) == 0: + return "fail" + + file_dict = list_file.get("data") # 当所有文件状态都为success,则status为success - all_success = all(item['status'] == 'success' for item in file_dict) + all_success = all(item["status"] == "success" for item in file_dict) if all_success: - return 'success' + return "success" # 当所有文件状态都为not_start,则status为not_start - all_success = all(item['status'] == 'not_start' for item in file_dict) + all_success = all(item["status"] == "not_start" for item in file_dict) if all_success: - return 'not_start' + return "not_start" # 只要有一个文件状态为fail,则status为fail - status_fail = any(item['status'] == 'fail' for item in file_dict) + status_fail = any(item["status"] == "fail" for item in file_dict) if status_fail: - return 'fail' + return "fail" return status -def _get_qa_process_file_num( - task_id, - conn_pool -): - list_file_name_params = { - 'task_id': task_id - } +def _get_qa_process_file_num(task_id, conn_pool): + list_file_name_params = {"task_id": task_id} list_file_name_res = data_process_detail_db_operate.list_file_name_in_qa_by_task_id( - list_file_name_params, - pool=conn_pool + list_file_name_params, pool=conn_pool ) - if list_file_name_res.get('status') == 200: - return len(list_file_name_res.get('data')) + if list_file_name_res.get("status") == 200: + return len(list_file_name_res.get("data")) else: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} Get the number of files processed after QA " - ])) + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} Get the number of files processed after QA " + ] + ) + ) return 0 -def _get_clean_process_file_num( - task_id, - conn_pool -): - list_file_name_params = { - 'task_id': task_id - } +def _get_clean_process_file_num(task_id, conn_pool): + list_file_name_params = {"task_id": task_id} list_file_name_res = data_process_detail_db_operate.list_file_name_for_clean( - list_file_name_params, - pool=conn_pool + list_file_name_params, pool=conn_pool ) - if list_file_name_res.get('status') == 200: - return len(list_file_name_res.get('data')) + if list_file_name_res.get("status") == 200: + return len(list_file_name_res.get("data")) else: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} Get the number of files processed after cleaning " - ])) + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} Get the number of files processed after cleaning " + ] + ) + ) return 0 -def _get_privacy_process_file_num( - task_id, - conn_pool -): - list_file_name_params = { - 'task_id': task_id - } +def _get_privacy_process_file_num(task_id, conn_pool): + list_file_name_params = {"task_id": task_id} list_file_name_res = data_process_detail_db_operate.list_file_name_for_privacy( - list_file_name_params, - pool=conn_pool + list_file_name_params, pool=conn_pool ) - if list_file_name_res.get('status') == 200: - return len(list_file_name_res.get('data')) + if list_file_name_res.get("status") == 200: + return len(list_file_name_res.get("data")) else: - logger.error(''.join([ - f"{log_tag_const.MINIO_STORE_PROCESS} Get the number of files processed 
after privacy " - ])) + logger.error( + "".join( + [ + f"{log_tag_const.MINIO_STORE_PROCESS} Get the number of files processed after privacy " + ] + ) + ) return 0 diff --git a/pypi/data-processing/src/transform/text/clean_transform.py b/pypi/data-processing/src/transform/text/clean_transform.py index 8deb471ed..0234ab9b3 100644 --- a/pypi/data-processing/src/transform/text/clean_transform.py +++ b/pypi/data-processing/src/transform/text/clean_transform.py @@ -19,28 +19,29 @@ import ftfy import opencc -from selectolax.parser import HTMLParser - from common import log_tag_const, special_characters +from selectolax.parser import HTMLParser logger = logging.getLogger(__name__) def remove_invisible_characters(text): """remove invisible characters. - + text: text; usage input: - “一户一表、水表出户、抄表到户”是指一个家庭用户安装一个计量水表,计量水表安装在住宅的公共部位,供水企业抄表到户,按户计量收费。 + “一户一表、水表出户、抄表到户”是指一个家庭用户安装一个计量水表,计量水表安装在住宅的公共部位,供水企业抄表到户,按户计量收费。 output: “一户一表、水表出户、抄表到户”是指一个家庭用户安装一个计量水表,计量水表安装在住宅的公共部位,供水企业抄表到户,按户计量收费。 """ try: - pattern = r'[\x00-\x1F\x7F-\x9F\xAD\r\t\b\x0B\x1C\x1D\x1E]' - find_pattern = r'[^,。!?,.!?]*[\x00-\x1F\x7F-\x9F\xAD\r\t\b\x0B\x1C\x1D\x1E][^,。!?,.!?]*' - replace_text = '' + pattern = r"[\x00-\x1F\x7F-\x9F\xAD\r\t\b\x0B\x1C\x1D\x1E]" + find_pattern = ( + r"[^,。!?,.!?]*[\x00-\x1F\x7F-\x9F\xAD\r\t\b\x0B\x1C\x1D\x1E][^,。!?,.!?]*" + ) + replace_text = "" clean_text = re.sub(pattern, replace_text, text) @@ -48,45 +49,45 @@ def remove_invisible_characters(text): text=text, pattern=pattern, find_pattern=find_pattern, - replace_text=replace_text + replace_text=replace_text, ) return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': clean_text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": clean_text}, } except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.CLEAN_TRANSFORM} Execute removing invisible characters failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.CLEAN_TRANSFORM} Execute removing invisible characters failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} def space_standardization(text): """space standardization. 
- + text: text; usage: input: 第一条 灭火是指国家综合性消防救援队、专职消防队依法承担的火灾扑救工作。 - + output: 第一条 灭火是指国家综合性消防救援队、专职消防队依法承担的火灾扑救工作。 """ try: various_whitespaces = special_characters.VARIOUS_WHITESPACES - pattern = '|'.join(re.escape(value) for value in various_whitespaces) - find_pattern = '|'.join(f'[^,。!?,.!?]*{re.escape(value)}[^,。!?,.!?]*' for value in various_whitespaces) - replace_text = ' ' + pattern = "|".join(re.escape(value) for value in various_whitespaces) + find_pattern = "|".join( + f"[^,。!?,.!?]*{re.escape(value)}[^,。!?,.!?]*" + for value in various_whitespaces + ) + replace_text = " " clean_text = re.sub(pattern, replace_text, text) @@ -94,27 +95,25 @@ def space_standardization(text): text=text, pattern=pattern, find_pattern=find_pattern, - replace_text=replace_text + replace_text=replace_text, ) return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': clean_text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": clean_text}, } except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.CLEAN_TRANSFORM} Executing space standardization failed.\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } + logger.error( + "".join( + [ + f"{log_tag_const.CLEAN_TRANSFORM} Executing space standardization failed.\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} + def remove_garbled_text(text): """remove garbled text. @@ -123,132 +122,114 @@ def remove_garbled_text(text): usage: input: 江苏省滨海县人民法院民事判决书(2015)滨滩商初字第0014号原告孟庆连,男,49岁,居民。委托代理人王成庭,滨海县滨淮法律服务所法律工作者。 — like this one. - + output: 江苏省滨海县人民法院民事判决书(2015)滨滩商初字第0014号原告孟庆连,男,49岁,居民。委托代理人王成庭,滨海县滨淮法律服务所法律工作者。 — like this one. """ try: clean_text = ftfy.fix_text(text) - return { - 'status': 200, - 'message': '', - 'data': { - 'found': 0, - 'text': clean_text - } - } + return {"status": 200, "message": "", "data": {"found": 0, "text": clean_text}} except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.CLEAN_TRANSFORM} Executing space standardization failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.CLEAN_TRANSFORM} Executing space standardization failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + + return {"status": 400, "message": error, "data": traceback.format_exc()} - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } def traditional_to_simplified(text): """Traditional Chinese to Simplified Chinese. 
- + text: text; usage: input: 風暴帶來的暫停使消防員和其他緊急反應人員得以進入禁區進行結構破壞評估。 - + output: 风暴带来的暂停使消防员和其他紧急反应人员得以进入禁区进行结构破坏评估。 """ try: - clean_text = opencc.OpenCC('t2s').convert(text) + clean_text = opencc.OpenCC("t2s").convert(text) - return { - 'status': 200, - 'message': '', - 'data': { - 'found': 0, - 'text': clean_text - } - } + return {"status": 200, "message": "", "data": {"found": 0, "text": clean_text}} except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.CLEAN_TRANSFORM} Executing Traditional Chinese to Simplified Chinese failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.CLEAN_TRANSFORM} Executing Traditional Chinese to Simplified Chinese failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } + return {"status": 400, "message": error, "data": traceback.format_exc()} def remove_html_tag(text): """clean html code in text samples. - + text: text; usage: input:

朗播 SAT 学员成绩单分析报告 - + output: 朗播 SAT 学员成绩单分析报告 """ try: - text = text.replace('
  • ', '\n*') - text = text.replace('
  • ', '') - text = text.replace('
      ', '\n*') - text = text.replace('
    ', '') + text = text.replace("
  • ", "\n*") + text = text.replace("
  • ", "") + text = text.replace("
      ", "\n*") + text = text.replace("
    ", "") parser = HTMLParser(text) clean_text = parser.text() - return { - 'status': 200, - 'message': '', - 'data': { - 'found': 0, - 'text': clean_text - } - } + return {"status": 200, "message": "", "data": {"found": 0, "text": clean_text}} except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.CLEAN_TRANSFORM} Executing clean html code in text samples failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.CLEAN_TRANSFORM} Executing clean html code in text samples failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } + return {"status": 400, "message": error, "data": traceback.format_exc()} def remove_emojis(text): """remove emojis. - + text: text; usage: input: 这是一段带有表情符号😊的文本。 - + output: 这是一段带有表情符号的文本。 """ try: emojis = special_characters.EMOJI - pattern = '|'.join(re.escape(value) for value in emojis) - find_pattern = '|'.join(f'[^,。!?,.!?]*{re.escape(value)}[^,。!?,.!?]*' for value in emojis) - replace_text = '' + pattern = "|".join(re.escape(value) for value in emojis) + find_pattern = "|".join( + f"[^,。!?,.!?]*{re.escape(value)}[^,。!?,.!?]*" for value in emojis + ) + replace_text = "" clean_text = re.sub(pattern, replace_text, text) @@ -256,39 +237,32 @@ def remove_emojis(text): text=text, pattern=pattern, find_pattern=find_pattern, - replace_text=replace_text + replace_text=replace_text, ) - + return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': clean_text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": clean_text}, } except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.CLEAN_TRANSFORM} Executing remove emojis failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.CLEAN_TRANSFORM} Executing remove emojis failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } + return {"status": 400, "message": error, "data": traceback.format_exc()} -def _find_clean_data( - text, - pattern, - find_pattern, - replace_text -): + +def _find_clean_data(text, pattern, find_pattern, replace_text): """find clean data for pre_content and post_content. - + text: text; pattern: ; find_pattern: ; @@ -299,9 +273,6 @@ def _find_clean_data( sentences = re.findall(find_pattern, text) for sentence in sentences: post_content = re.sub(pattern, replace_text, sentence) - clean_data.append({ - 'pre_content': sentence, - 'post_content': post_content - }) + clean_data.append({"pre_content": sentence, "post_content": post_content}) return clean_data diff --git a/pypi/data-processing/src/transform/text/privacy_transform.py b/pypi/data-processing/src/transform/text/privacy_transform.py index d046d857d..b83fa3ff0 100644 --- a/pypi/data-processing/src/transform/text/privacy_transform.py +++ b/pypi/data-processing/src/transform/text/privacy_transform.py @@ -22,430 +22,358 @@ logger = logging.getLogger(__name__) -def remove_email( - text, - replace_string=None -): +def remove_email(text, replace_string=None): """Replace email info with the user defined string. 
- + text: text; replace_string: the text is used to replace the email info; usage: input: 如果需要可以联系官方邮箱:172817631@qq.com马上申请为你开通 - + output: 如果需要可以联系官方邮箱:xxxxxx马上申请为你开通 """ try: if replace_string is None: - replace_string = 'xxxxxx' + replace_string = "xxxxxx" + + pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}" + find_pattern = ( + r"[^,。!?,.!?]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}[^,。!?,.!?]*" + ) - pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' - find_pattern = r'[^,。!?,.!?]*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}[^,。!?,.!?]*' - clean_text = re.sub(pattern, replace_string, text) clean_data = _find_clean_data( text=text, pattern=pattern, find_pattern=find_pattern, - replace_string=replace_string + replace_string=replace_string, ) return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': clean_text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": clean_text}, } except Exception as ex: - logger.error(''.join([ - f"{log_tag_const.CLEAN_TRANSFORM} Execute removing email.\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) - return { - 'status': 400, - 'message': str(ex), - 'data': traceback.format_exc() - } - -def remove_ip_address( - text, - replace_string=None -): + logger.error( + "".join( + [ + f"{log_tag_const.CLEAN_TRANSFORM} Execute removing email.\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + return {"status": 400, "message": str(ex), "data": traceback.format_exc()} + + +def remove_ip_address(text, replace_string=None): """the ip addresses are replaced with xxxxxx. - + text: text; replace_string: the text is used to replace the email info; usage: input: 服务器登陆ip为192.168.255.255 - + output: 服务器登陆ip为xxxxxx """ try: if replace_string is None: - replace_string = 'xxxxxx' - - pattern = ''.join([ - r'((?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|', - r'(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))', - r'{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|', - r'(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|', - r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4})' - ]) - - find_pattern = ''.join([ - r'([^,。!?,.!?]*)', - pattern, - r'([^,。!?,.!?]*)' - ]) - - clean_text = re.sub(pattern=pattern, - repl=replace_string, - string=text) + replace_string = "xxxxxx" + + pattern = "".join( + [ + r"((?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|", + r"(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))", + r"{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|", + r"(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|", + r"([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4})", + ] + ) + + find_pattern = "".join([r"([^,。!?,.!?]*)", pattern, r"([^,。!?,.!?]*)"]) + + clean_text = re.sub(pattern=pattern, repl=replace_string, string=text) clean_data = [] sentences = re.findall(find_pattern, text) for sentence in sentences: - sentence = ''.join([ - sentence[0], - sentence[1], - sentence[3] - ]) + sentence = "".join([sentence[0], sentence[1], sentence[3]]) post_content = re.sub(pattern, replace_string, sentence) - clean_data.append({ - 'pre_content': sentence, - 'post_content': post_content - }) + clean_data.append({"pre_content": sentence, "post_content": post_content}) return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': clean_text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": clean_text}, } except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove email failed\n", - f"The tracing error is: 
\n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove email failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + + return {"status": 400, "message": error, "data": traceback.format_exc()} - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } -def remove_phone( - text, - replace_string=None -): +def remove_phone(text, replace_string=None): """the phone are replaced with xxxxxx. - + text: text; replace_string: the text is used to replace the email info; usage: input: 12345678910, 我的手机号是: 18617261536,我的座机号是: 029-1234567 - + output: 12345678910, 我的手机号是: xxxxxx,我的座机号是: 029-1234567 """ try: if replace_string is None: - replace_string = 'xxxxxx' + replace_string = "xxxxxx" - pattern = r'((\+|00)86)?(1)((3[\d])|(4[5,6,7,9])|(5[0-3,5-9])|(6[5-7])|(7[0-8])|(8[\d])|(9[1,8,9]))(\d{8})(?![0-9])' - find_pattern = ''.join([ - r'([^,。!?,.!?]*)', - pattern, - r'([^,。!?,.!?]*)' - ]) + pattern = r"((\+|00)86)?(1)((3[\d])|(4[5,6,7,9])|(5[0-3,5-9])|(6[5-7])|(7[0-8])|(8[\d])|(9[1,8,9]))(\d{8})(?![0-9])" + find_pattern = "".join([r"([^,。!?,.!?]*)", pattern, r"([^,。!?,.!?]*)"]) - clean_text = re.sub(pattern=pattern, - repl=replace_string, - string=text) + clean_text = re.sub(pattern=pattern, repl=replace_string, string=text) clean_data = [] sentences = re.findall(find_pattern, text) for sentence in sentences: - sentence = ''.join([ - sentence[0], - sentence[3], - sentence[4], - sentence[12], - sentence[13] - ]) + sentence = "".join( + [sentence[0], sentence[3], sentence[4], sentence[12], sentence[13]] + ) post_content = re.sub(pattern, replace_string, sentence) - clean_data.append({ - 'pre_content': sentence, - 'post_content': post_content - }) + clean_data.append({"pre_content": sentence, "post_content": post_content}) return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': clean_text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": clean_text}, } except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove phone failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove phone failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + + return {"status": 400, "message": error, "data": traceback.format_exc()} - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } -def remove_id_card( - text, - replace_string=None -): +def remove_id_card(text, replace_string=None): """the phone are replaced with xxxxxx. 
- + text: text; replace_string: the text is used to replace the email info; usage: input: 身份证号1:123451230112121234,身份证号2:12345123011212123x,位身份证号3:123456780009876 - + output: 身份证号1:xxxxxx,身份证号2:xxxxxx,位身份证号3:xxxxxx """ try: if replace_string is None: - replace_string = 'xxxxxx' + replace_string = "xxxxxx" id_card_regex = [ - r'\b([1-9]\d{5}[1-9]\d{3})((0\d)|(1[0-2]))(([0|1|2]\d)|(3[0-1]))(\d{3}[0-9Xx])(?![0-9])', - r'\b([1-9]\d{7})((0\d)|(1[0-2]))(([0-2][1-9])|(3[0-1]))(\d{2}[0-9Xx])(?![0-9])' + r"\b([1-9]\d{5}[1-9]\d{3})((0\d)|(1[0-2]))(([0|1|2]\d)|(3[0-1]))(\d{3}[0-9Xx])(?![0-9])", + r"\b([1-9]\d{7})((0\d)|(1[0-2]))(([0-2][1-9])|(3[0-1]))(\d{2}[0-9Xx])(?![0-9])", ] clean_data = [] for regex_exp in id_card_regex: - find_pattern = ''.join([ - r'([^,。!?,.!?]*)', - regex_exp, - r'([^,。!?,.!?]*)' - ]) + find_pattern = "".join([r"([^,。!?,.!?]*)", regex_exp, r"([^,。!?,.!?]*)"]) sentences = re.findall(find_pattern, text) - - text = re.sub(pattern=regex_exp, - repl=replace_string, - string=text) + + text = re.sub(pattern=regex_exp, repl=replace_string, string=text) for sentence in sentences: - sentence = ''.join([ - sentence[0], - sentence[1], - sentence[2], - sentence[5], - sentence[8], - sentence[9] - ]) + sentence = "".join( + [ + sentence[0], + sentence[1], + sentence[2], + sentence[5], + sentence[8], + sentence[9], + ] + ) post_content = re.sub(regex_exp, replace_string, sentence) - clean_data.append({ - 'pre_content': sentence, - 'post_content': post_content - }) + clean_data.append( + {"pre_content": sentence, "post_content": post_content} + ) return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": text}, } except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove id card failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove id card failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } + return {"status": 400, "message": error, "data": traceback.format_exc()} -def remove_weixin( - text, - replace_string=None -): + +def remove_weixin(text, replace_string=None): """the weixin are replaced with xxxxxx. 
- + text: text; replace_string: the text is used to replace the email info; usage: input: 我的微信号:qw123456 - + output: 我的xxxxxx """ try: if replace_string is None: - replace_string = 'xxxxxx' + replace_string = "xxxxxx" weixin_regex = [ - r'vxin[:|:][a-zA-Z0-9{3,20}]+', - r'vx[:|:][a-zA-Z0-9{3,20}]+', - r'VX[:|:][a-zA-Z0-9{3,20}]+', - r'Vxin[:|:][a-zA-Z0-9{3,20}]+', - r'wx[:|:][a-zA-Z0-9{3,20}]+', - r'WX[:|:][a-zA-Z0-9{3,20}]+', - r'wei xin[:|:][a-zA-Z0-9{3,20}]+', - r'weixin[:|:][a-zA-Z0-9{3,20}]+', - r'微信[:|:][a-zA-Z0-9{3,20}]+', - r'微信号[:|:][a-zA-Z0-9{3,20}]+', - r'薇信[:|:][a-zA-Z0-9{3,20}]+', - r'薇信号[:|:][a-zA-Z0-9{3,20}]+', - r'v信[:|:][a-zA-Z0-9{3,20}]+', - r'V信[:|:][a-zA-Z0-9{3,20}]+' + r"vxin[:|:][a-zA-Z0-9{3,20}]+", + r"vx[:|:][a-zA-Z0-9{3,20}]+", + r"VX[:|:][a-zA-Z0-9{3,20}]+", + r"Vxin[:|:][a-zA-Z0-9{3,20}]+", + r"wx[:|:][a-zA-Z0-9{3,20}]+", + r"WX[:|:][a-zA-Z0-9{3,20}]+", + r"wei xin[:|:][a-zA-Z0-9{3,20}]+", + r"weixin[:|:][a-zA-Z0-9{3,20}]+", + r"微信[:|:][a-zA-Z0-9{3,20}]+", + r"微信号[:|:][a-zA-Z0-9{3,20}]+", + r"薇信[:|:][a-zA-Z0-9{3,20}]+", + r"薇信号[:|:][a-zA-Z0-9{3,20}]+", + r"v信[:|:][a-zA-Z0-9{3,20}]+", + r"V信[:|:][a-zA-Z0-9{3,20}]+", ] clean_data = [] for regex_exp in weixin_regex: - find_pattern = ''.join([ - r'[^,。!?,.!?]*', - regex_exp, - r'[^,。!?,.!?]*' - ]) + find_pattern = "".join([r"[^,。!?,.!?]*", regex_exp, r"[^,。!?,.!?]*"]) sentences = re.findall(regex_exp, text) - - text = re.sub(pattern=regex_exp, - repl=replace_string, - string=text) + + text = re.sub(pattern=regex_exp, repl=replace_string, string=text) for sentence in sentences: post_content = re.sub(regex_exp, replace_string, sentence) - clean_data.append({ - 'pre_content': sentence, - 'post_content': post_content - }) + clean_data.append( + {"pre_content": sentence, "post_content": post_content} + ) return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": text}, } except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove id card failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove id card failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + + return {"status": 400, "message": error, "data": traceback.format_exc()} - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } -def remove_bank_card( - text, - replace_string=None -): +def remove_bank_card(text, replace_string=None): """the remove bank card are replaced with xxxxxx. 
- + text: text; usage: input: 银行卡号1:1234567890123456,银行卡号2:12345678901234567,银行卡号3:1234567890123456789 - + output: 银行卡号1:xxxxxx,银行卡号2:12345678901234567,银行卡号3:xxxxxx """ try: if replace_string is None: - replace_string = 'xxxxxx' + replace_string = "xxxxxx" - pattern = r'\b([1-9]{1})(\d{15}|\d{18})(?![0-9])' - find_pattern = r'([^,。!?,.!?]*)\b([1-9]{1})(\d{15}|\d{18})((?![0-9])[^,。!?,.!?]*)' + pattern = r"\b([1-9]{1})(\d{15}|\d{18})(?![0-9])" + find_pattern = ( + r"([^,。!?,.!?]*)\b([1-9]{1})(\d{15}|\d{18})((?![0-9])[^,。!?,.!?]*)" + ) - clean_text = re.sub(pattern=pattern, - repl=replace_string, - string=text) + clean_text = re.sub(pattern=pattern, repl=replace_string, string=text) clean_data = _find_clean_data( text=text, pattern=pattern, find_pattern=find_pattern, - replace_string=replace_string + replace_string=replace_string, ) return { - 'status': 200, - 'message': '', - 'data': { - 'clean_data': clean_data, - 'text': clean_text - } + "status": 200, + "message": "", + "data": {"clean_data": clean_data, "text": clean_text}, } except Exception as ex: error = str(ex) - logger.error(''.join([ - f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove email failed\n", - f"The tracing error is: \n{traceback.format_exc()}\n" - ])) + logger.error( + "".join( + [ + f"{log_tag_const.PRIVACY_TRANSFORM} Executing remove email failed\n", + f"The tracing error is: \n{traceback.format_exc()}\n", + ] + ) + ) + + return {"status": 400, "message": error, "data": traceback.format_exc()} - return { - 'status': 400, - 'message': error, - 'data': traceback.format_exc() - } -def _find_clean_data( - text, - replace_string, - pattern, - find_pattern -): +def _find_clean_data(text, replace_string, pattern, find_pattern): """find clean data for pre_content and post_content. - + text: text; pattern: ; find_pattern: ; replace_string: replace string for privacy - + """ clean_data = [] sentences = re.findall(find_pattern, text) for sentence in sentences: post_content = re.sub(pattern, replace_string, sentence) - clean_data.append({ - 'pre_content': sentence, - 'post_content': post_content - }) + clean_data.append({"pre_content": sentence, "post_content": post_content}) return clean_data diff --git a/pypi/data-processing/src/transform/text/support_type.py b/pypi/data-processing/src/transform/text/support_type.py index a20e74aaf..48bdeec5d 100644 --- a/pypi/data-processing/src/transform/text/support_type.py +++ b/pypi/data-processing/src/transform/text/support_type.py @@ -13,134 +13,133 @@ # limitations under the License. 
- def get_default_support_types(): """Get the default support types.""" return [ { - 'name': 'chunk_processing', - 'description': '拆分处理', - 'children': [ - { - 'name': 'qa_split', - 'enable': 'true', - 'zh_name': 'QA拆分', - 'description': '根据文件中的文档内容,自动将文件做 QA 拆分处理。' + "name": "chunk_processing", + "description": "拆分处理", + "children": [ + { + "name": "qa_split", + "enable": "true", + "zh_name": "QA拆分", + "description": "根据文件中的文档内容,自动将文件做 QA 拆分处理。", }, { - 'name': 'document_chunk', - 'enable': 'false', - 'zh_name': '文本分段', - 'description': '' - } - ] + "name": "document_chunk", + "enable": "false", + "zh_name": "文本分段", + "description": "", + }, + ], }, { - 'name': 'clean', - 'description': '异常清洗配置', - 'children': [ - { - 'name': 'remove_invisible_characters', - 'enable': 'true', - 'zh_name': '移除不可见字符', - 'description': '移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围' + "name": "clean", + "description": "异常清洗配置", + "children": [ + { + "name": "remove_invisible_characters", + "enable": "true", + "zh_name": "移除不可见字符", + "description": "移除ASCII中的一些不可见字符, 如0-32 和127-160这两个范围", }, { - 'name': 'space_standardization', - 'enable': 'true', - 'zh_name': '空格处理', - 'description': '将不同的unicode空格比如u2008, 转成正常的空格' + "name": "space_standardization", + "enable": "true", + "zh_name": "空格处理", + "description": "将不同的unicode空格比如u2008, 转成正常的空格", }, { - 'name': 'remove_garbled_text', - 'enable': 'false', - 'zh_name': '去除乱码', - 'description': '去除乱码和无意义的unicode' + "name": "remove_garbled_text", + "enable": "false", + "zh_name": "去除乱码", + "description": "去除乱码和无意义的unicode", }, { - 'name': 'traditional_to_simplified', - 'enable': 'false', - 'zh_name': '繁转简', - 'description': '繁体转简体,如“不經意,妳的笑容”清洗成“不经意,你的笑容”' + "name": "traditional_to_simplified", + "enable": "false", + "zh_name": "繁转简", + "description": "繁体转简体,如“不經意,妳的笑容”清洗成“不经意,你的笑容”", }, { - 'name': 'remove_html_tag', - 'enable': 'false', - 'zh_name': '去除网页标识符', - 'description': '移除文档中的html标签, 如,,

    等' + "name": "remove_html_tag", + "enable": "false", + "zh_name": "去除网页标识符", + "description": "移除文档中的html标签, 如,,

    等", }, { - 'name': 'remove_emojis', - 'enable': 'false', - 'zh_name': '去除表情', - 'description': '去除文档中的表情,如‘🐰’, ‘🧑🏼’等' - } - ] + "name": "remove_emojis", + "enable": "false", + "zh_name": "去除表情", + "description": "去除文档中的表情,如‘🐰’, ‘🧑🏼’等", + }, + ], }, { - 'name': 'filtration', - 'description': '数据过滤配置', - 'children': [ - { - 'name': 'character_duplication_rate', - 'enable': 'false', - 'zh_name': '字重复率过滤', - 'description': '如果字重复率太高,意味着文档中重复的字太多,文档会被过滤掉' + "name": "filtration", + "description": "数据过滤配置", + "children": [ + { + "name": "character_duplication_rate", + "enable": "false", + "zh_name": "字重复率过滤", + "description": "如果字重复率太高,意味着文档中重复的字太多,文档会被过滤掉", }, { - 'name': 'word_duplication_rate', - 'enable': 'false', - 'zh_name': '词重复率过滤', - 'description': '如果词重复率太高,意味着文档中重复的词太多,文档会被过滤掉' + "name": "word_duplication_rate", + "enable": "false", + "zh_name": "词重复率过滤", + "description": "如果词重复率太高,意味着文档中重复的词太多,文档会被过滤掉", }, { - 'name': 'special_character_rate', - 'enable': 'false', - 'zh_name': '特殊字符串率', - 'description': '如果特殊字符率太高,意味着文档中特殊字符太多,文档会被过滤掉' + "name": "special_character_rate", + "enable": "false", + "zh_name": "特殊字符串率", + "description": "如果特殊字符率太高,意味着文档中特殊字符太多,文档会被过滤掉", }, { - 'name': 'pornography_violence_word_rate', - 'enable': 'false', - 'zh_name': '色情暴力词率', - 'description': '如果色情暴力词率太高,文档会被过滤掉' - } - ] + "name": "pornography_violence_word_rate", + "enable": "false", + "zh_name": "色情暴力词率", + "description": "如果色情暴力词率太高,文档会被过滤掉", + }, + ], }, { - 'name': 'duplicates', - 'description': '数据去重配置', - 'children': [ - { - 'name': 'simhash', - 'enable': 'false', - 'zh_name': 'Simhash', - 'description': '根据海明距离计算文档相似度, 相似度<=海明距离,认为两个文档相似。(范围:4-6)' + "name": "duplicates", + "description": "数据去重配置", + "children": [ + { + "name": "simhash", + "enable": "false", + "zh_name": "Simhash", + "description": "根据海明距离计算文档相似度, 相似度<=海明距离,认为两个文档相似。(范围:4-6)", } - ] + ], }, { - 'name': 'privacy_erosion', - 'description': '数据隐私配置', - 'children': [ - { - 'name': 'remove_email', - 'enable': 'true', - 'zh_name': '去除Email', - 'description': '去除email地址' + "name": "privacy_erosion", + "description": "数据隐私配置", + "children": [ + { + "name": "remove_email", + "enable": "true", + "zh_name": "去除Email", + "description": "去除email地址", }, { - 'name': 'remove_ip_address', - 'enable': 'false', - 'zh_name': '去除IP地址', - 'description': '去除IPv4 或者 IPv6 地址' + "name": "remove_ip_address", + "enable": "false", + "zh_name": "去除IP地址", + "description": "去除IPv4 或者 IPv6 地址", }, { - 'name': 'remove_number', - 'enable': 'false', - 'zh_name': '去除数字', - 'description': '去除数字和字母数字标识符,如电话号码、信用卡号、十六进制散列等,同时跳过年份和简单数字的实例' - } - ] - } + "name": "remove_number", + "enable": "false", + "zh_name": "去除数字", + "description": "去除数字和字母数字标识符,如电话号码、信用卡号、十六进制散列等,同时跳过年份和简单数字的实例", + }, + ], + }, ] diff --git a/pypi/data-processing/src/utils/class_utils.py b/pypi/data-processing/src/utils/class_utils.py index 8388f814e..c6f02bfd8 100644 --- a/pypi/data-processing/src/utils/class_utils.py +++ b/pypi/data-processing/src/utils/class_utils.py @@ -31,4 +31,5 @@ def __call__(cls, *args: Any, **kwargs: Any) -> Any: class AbstractSingleton(abc.ABC, metaclass=Singleton): """Abstract singleton class for ensuring only one instance of a class""" + pass diff --git a/pypi/data-processing/src/utils/csv_utils.py b/pypi/data-processing/src/utils/csv_utils.py index da5da0eb2..71f560392 100644 --- a/pypi/data-processing/src/utils/csv_utils.py +++ b/pypi/data-processing/src/utils/csv_utils.py @@ -22,13 +22,9 @@ logger = logging.getLogger(__name__) -def save_csv( - file_name, - 
phase_value, - data -): +def save_csv(file_name, phase_value, data): """Save the csv file. - + file_name: file name; phase_value: phase value """ @@ -39,16 +35,19 @@ def save_csv( if not os.path.exists(directory_path): os.makedirs(directory_path) - file_path = directory_path + '/' + file_name + file_path = directory_path + "/" + file_name - logger.debug(''.join([ - f"{log_tag_const.CSV_HANDLE} Save a csv file.\n", - f"file path: {file_path}" - ])) + logger.debug( + "".join( + [ + f"{log_tag_const.CSV_HANDLE} Save a csv file.\n", + f"file path: {file_path}", + ] + ) + ) - with open(file_path, 'w', newline='') as file: + with open(file_path, "w", newline="") as file: writer = csv.writer(file) writer.writerows(data) return file_path - diff --git a/pypi/data-processing/src/utils/date_time_utils.py b/pypi/data-processing/src/utils/date_time_utils.py index 64772df2a..fc9da447d 100644 --- a/pypi/data-processing/src/utils/date_time_utils.py +++ b/pypi/data-processing/src/utils/date_time_utils.py @@ -23,7 +23,7 @@ def now_str(): def now_utc_str(): - return datetime.datetime.now(pytz.utc).strftime('%Y-%m-%dT%H:%M:%SZ') + return datetime.datetime.now(pytz.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def now_str_for_day(): @@ -42,13 +42,7 @@ def timestamp_to_str_second(timestamp): return f"{datetime.datetime.fromtimestamp(timestamp):%Y-%m-%d %H:%M:%S}" -def chage_datetime_fromat( - date_time, - from_format -): - my_date_time = datetime.datetime.strptime( - date_time, - from_format - ) +def chage_datetime_fromat(date_time, from_format): + my_date_time = datetime.datetime.strptime(date_time, from_format) - return my_date_time.strftime(opt.get('to_format', '%Y-%m-%d %H:%M:%S')) + return my_date_time.strftime(opt.get("to_format", "%Y-%m-%d %H:%M:%S")) diff --git a/pypi/data-processing/src/utils/docx_utils.py b/pypi/data-processing/src/utils/docx_utils.py index 2e8e1591e..649de8dfc 100644 --- a/pypi/data-processing/src/utils/docx_utils.py +++ b/pypi/data-processing/src/utils/docx_utils.py @@ -15,11 +15,9 @@ import docx -def get_content( - file_path -): +def get_content(file_path): """Get the content from a word docx file. - + file_path: file path; """ doc = docx.Document(file_path) @@ -29,4 +27,4 @@ def get_content( text = para.text content += text - return content + return content diff --git a/pypi/data-processing/src/utils/file_utils.py b/pypi/data-processing/src/utils/file_utils.py index b3460e10b..fc38288ad 100644 --- a/pypi/data-processing/src/utils/file_utils.py +++ b/pypi/data-processing/src/utils/file_utils.py @@ -17,27 +17,23 @@ from pathlib import Path -def get_file_name( - file_name, - handle_name -): +def get_file_name(file_name, handle_name): """Get file name.""" - file_extension = file_name.split('.')[-1].lower() - file_name_without_extension = file_name.rsplit('.', 1)[0] + file_extension = file_name.split(".")[-1].lower() + file_name_without_extension = file_name.rsplit(".", 1)[0] - return file_name_without_extension + '_' + handle_name + '.' + file_extension + return file_name_without_extension + "_" + handle_name + "." 
+ file_extension def get_temp_file_path(): """Get temp file path""" current_directory = os.getcwd() - csv_file_path = os.path.join(current_directory, 'file_handle/temp_file/') + csv_file_path = os.path.join(current_directory, "file_handle/temp_file/") return csv_file_path - def delete_file(file_path): """Delete file""" os.remove(file_path) @@ -51,6 +47,7 @@ def get_file_extension(file_name): return file_extension + def get_file_name_without_extension(file_name): """Get file name without extension""" path = Path(file_name) diff --git a/pypi/data-processing/src/utils/json_utils.py b/pypi/data-processing/src/utils/json_utils.py index c78eb918b..f9de34042 100644 --- a/pypi/data-processing/src/utils/json_utils.py +++ b/pypi/data-processing/src/utils/json_utils.py @@ -18,40 +18,25 @@ import ujson -def get_str_empty( - json_item, - json_key -): - if json_item.get(json_key, '') is None: - return '' +def get_str_empty(json_item, json_key): + if json_item.get(json_key, "") is None: + return "" - return json_item.get(json_key, '') + return json_item.get(json_key, "") def write_json_file( - file_name, - data, - indent=None, - ensure_ascii=None, - escape_forward_slashes=None + file_name, data, indent=None, ensure_ascii=None, escape_forward_slashes=None ): file_name = Path(file_name) - with open(file_name, 'w', encoding='utf-8') as outfile: - dump( - data, - outfile, - indent, - ensure_ascii, - escape_forward_slashes - ) - - -def read_json_file( - file_name -): + with open(file_name, "w", encoding="utf-8") as outfile: + dump(data, outfile, indent, ensure_ascii, escape_forward_slashes) + + +def read_json_file(file_name): file_name = Path(file_name) json_result = None - with open(file_name, 'r', encoding='utf-8') as f: + with open(file_name, "r", encoding="utf-8") as f: json_result = ujson.load(f) return json_result @@ -62,7 +47,7 @@ def dumps( indent=None, ensure_ascii=None, sort_keys=None, - escape_forward_slashes=None + escape_forward_slashes=None, ): if indent is None: indent = 2 @@ -73,30 +58,27 @@ def dumps( if escape_forward_slashes is None: escape_forward_slashes = False - return ujson.dumps(json_data, - indent=indent, - ensure_ascii=ensure_ascii, - sort_keys=sort_keys, - escape_forward_slashes=escape_forward_slashes) + return ujson.dumps( + json_data, + indent=indent, + ensure_ascii=ensure_ascii, + sort_keys=sort_keys, + escape_forward_slashes=escape_forward_slashes, + ) -def dump( - json_data, - file, - indent=None, - ensure_ascii=None, - escape_forward_slashes=None -): +def dump(json_data, file, indent=None, ensure_ascii=None, escape_forward_slashes=None): if indent is None: - indent=2 + indent = 2 if ensure_ascii is None: - ensure_ascii=False + ensure_ascii = False if escape_forward_slashes is None: - escape_forward_slashes=False - - ujson.dump(json_data, - file, - indent=indent, - ensure_ascii=ensure_ascii, - escape_forward_slashes=escape_forward_slashes) + escape_forward_slashes = False + ujson.dump( + json_data, + file, + indent=indent, + ensure_ascii=ensure_ascii, + escape_forward_slashes=escape_forward_slashes, + ) diff --git a/pypi/data-processing/src/utils/log_utils.py b/pypi/data-processing/src/utils/log_utils.py index 068218b2e..1c0dc91b6 100644 --- a/pypi/data-processing/src/utils/log_utils.py +++ b/pypi/data-processing/src/utils/log_utils.py @@ -18,10 +18,7 @@ from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler -def init_config( - source_type, - log_dir -): +def init_config(source_type, log_dir): """Initialize the log config""" # Disable debug logs for the 
Kubernetes Python client logging.getLogger("kubernetes").setLevel(logging.WARNING) @@ -34,7 +31,7 @@ def init_config( f'{log_dir}/{source_type}_{datetime.datetime.now().strftime("%Y-%m-%d")}.log', when="midnight", interval=1, - backupCount=30 + backupCount=30, ) # 按天生成日志文件,最多保存30天的日志文件 file_handler.setLevel(logging.DEBUG) @@ -44,7 +41,7 @@ def init_config( f'log/{source_type}_{datetime.datetime.now().strftime("%Y-%m-%d")}.err.log', when="midnight", interval=1, - backupCount=30 + backupCount=30, ) # 按天生成日志文件,最多保存30天的日志文件 error_file_handler.suffix = "%Y-%m-%d" # 文件名的时间格式 @@ -52,10 +49,6 @@ def init_config( logging.basicConfig( level=logging.DEBUG, - format='%(asctime)s [%(levelname)s] - %(message)s', - handlers=[ - file_handler, - error_file_handler, - logging.StreamHandler() - ] + format="%(asctime)s [%(levelname)s] - %(message)s", + handlers=[file_handler, error_file_handler, logging.StreamHandler()], ) diff --git a/pypi/data-processing/src/utils/pdf_utils.py b/pypi/data-processing/src/utils/pdf_utils.py index 97774243f..5bf556b46 100644 --- a/pypi/data-processing/src/utils/pdf_utils.py +++ b/pypi/data-processing/src/utils/pdf_utils.py @@ -16,11 +16,9 @@ from pypdf import PdfReader -def get_content( - file_path -): +def get_content(file_path): """Get the content from a pdf file. - + file_path: file path; """ reader = PdfReader(file_path) @@ -29,6 +27,4 @@ def get_content( for page in reader.pages: content += page.extract_text() - return content - - + return content diff --git a/pypi/data-processing/src/utils/sanic_utils.py b/pypi/data-processing/src/utils/sanic_utils.py index b0e83f006..9a5ce83bf 100644 --- a/pypi/data-processing/src/utils/sanic_utils.py +++ b/pypi/data-processing/src/utils/sanic_utils.py @@ -16,27 +16,32 @@ import logging import traceback +from common import log_tag_const from sanic.handlers import ErrorHandler from sanic.response import json -from common import log_tag_const - logger = logging.getLogger(__name__) class CustomErrorHandler(ErrorHandler): """Custom the error handler for the sanic app""" + def default(self, request, exception): status_code = getattr(exception, "status_code", 500) - logger.error(''.join([ - f"{log_tag_const.WEB_SERVER_ERROR} The url has a error.\n", - f"url: {request.url}\n", - f"status code: {status_code} \n", - f"error trace: \n{traceback.format_exc()}" - ])) - return json({ - 'status': status_code, - 'message': str(exception), - 'data': traceback.format_exc() - }) - + logger.error( + "".join( + [ + f"{log_tag_const.WEB_SERVER_ERROR} The url has a error.\n", + f"url: {request.url}\n", + f"status code: {status_code} \n", + f"error trace: \n{traceback.format_exc()}", + ] + ) + ) + return json( + { + "status": status_code, + "message": str(exception), + "data": traceback.format_exc(), + } + ) diff --git a/pypi/ragas_once/pyproject.toml b/pypi/ragas_once/pyproject.toml index c80b42e96..5352f4b3f 100644 --- a/pypi/ragas_once/pyproject.toml +++ b/pypi/ragas_once/pyproject.toml @@ -1,7 +1,7 @@ [build-system] requires = [ - "setuptools>=61.0" - "ragas" + "setuptools>=61.0", + "ragas", "langchain==0.0.354" ] build-backend = "setuptools.build_meta" diff --git a/pypi/ragas_once/ragas_once/cli.py b/pypi/ragas_once/ragas_once/cli.py index 35d689200..8a798b18d 100644 --- a/pypi/ragas_once/ragas_once/cli.py +++ b/pypi/ragas_once/ragas_once/cli.py @@ -1,9 +1,10 @@ import argparse + import pandas as pd import ragas_once.wrapper as pkg +from datasets import Dataset, load_dataset from ragas import evaluate -from datasets import Dataset -from datasets import 
load_dataset + def main(): """ @@ -15,22 +16,40 @@ def main(): Returns: The result of evaluating the test set using the specified metrics. """ - parser = argparse.ArgumentParser(description='RAGAS CLI') - parser.add_argument("--model", type=str, default="gpt-3.5-turbo", - help="Specifies the model to use for evaluation. Defaults to gpt-3.5-turbo.") - parser.add_argument("--apibase", type=str, default="https://api.openai.com/v1", - help="Specifies the base URL for the API. Defaults to OpenAI.") - parser.add_argument("--apikey", type=str, - help="Specifies the API key to authenticate requests.") - parser.add_argument("--embeddings", type=str, - help="Specifies Huggingface embeddings model (or its path) to use for evaluation. Will use OpenAI embeddings if not set.") - parser.add_argument("--metrics", type=list, default=[], - help="Specifies the metrics to use for evaluation.") - parser.add_argument("--dataset", type=str, - help="Specifies the path to the dataset for evaluation. Will use fiqa dataset if not set.") + parser = argparse.ArgumentParser(description="RAGAS CLI") + parser.add_argument( + "--model", + type=str, + default="gpt-3.5-turbo", + help="Specifies the model to use for evaluation. Defaults to gpt-3.5-turbo.", + ) + parser.add_argument( + "--apibase", + type=str, + default="https://api.openai.com/v1", + help="Specifies the base URL for the API. Defaults to OpenAI.", + ) + parser.add_argument( + "--apikey", type=str, help="Specifies the API key to authenticate requests." + ) + parser.add_argument( + "--embeddings", + type=str, + help="Specifies Huggingface embeddings model (or its path) to use for evaluation. Will use OpenAI embeddings if not set.", + ) + parser.add_argument( + "--metrics", + type=list, + default=[], + help="Specifies the metrics to use for evaluation.", + ) + parser.add_argument( + "--dataset", + type=str, + help="Specifies the path to the dataset for evaluation. 
Will use fiqa dataset if not set.", + ) args = parser.parse_args() - model = args.model api_base = args.apibase api_key = args.apikey @@ -42,18 +61,18 @@ def main(): embeddings_model_name = args.embeddings if embeddings_model_name: - embeddings = pkg.wrap_embeddings('huggingface', embeddings_model_name, None) + embeddings = pkg.wrap_embeddings("huggingface", embeddings_model_name, None) else: - embeddings = pkg.wrap_embeddings('openai', None, api_key) + embeddings = pkg.wrap_embeddings("openai", None, api_key) if dataset: data = pd.read_csv(dataset) - data['ground_truths'] = data['ground_truths'].apply(lambda x: x.split(';')) - data['contexts'] = data['contexts'].apply(lambda x: x.split(';')) + data["ground_truths"] = data["ground_truths"].apply(lambda x: x.split(";")) + data["contexts"] = data["contexts"].apply(lambda x: x.split(";")) test_set = Dataset.from_pandas(data) else: - print('test_set not provided, using fiqa dataset') - fiqa = load_dataset('explodinggradients/fiqa', 'ragas_eval') + print("test_set not provided, using fiqa dataset") + fiqa = load_dataset("explodinggradients/fiqa", "ragas_eval") test_set = fiqa["baseline"].select(range(5)) ms = pkg.set_metrics(metrics, judge_model, embeddings) @@ -61,8 +80,8 @@ def main(): result = evaluate(test_set, ms) print(result) - result.to_pandas().to_csv('result.csv') + result.to_pandas().to_csv("result.csv") + -if __name__ == '__main__': +if __name__ == "__main__": main() - diff --git a/pypi/ragas_once/ragas_once/wrapper.py b/pypi/ragas_once/ragas_once/wrapper.py index 642c5b266..454221b05 100644 --- a/pypi/ragas_once/ragas_once/wrapper.py +++ b/pypi/ragas_once/ragas_once/wrapper.py @@ -1,35 +1,26 @@ import os + from datasets import Dataset from langchain.chat_models import ChatOpenAI -from ragas.llms import RagasLLM -from ragas.llms import LangchainLLM -from ragas.embeddings import RagasEmbeddings -from ragas.embeddings import OpenAIEmbeddings -from ragas.embeddings import HuggingfaceEmbeddings +from ragas.embeddings import (HuggingfaceEmbeddings, OpenAIEmbeddings, + RagasEmbeddings) +from ragas.llms import LangchainLLM, RagasLLM +from ragas.metrics import (answer_correctness, answer_relevancy, + answer_similarity, context_precision, + context_recall, context_relevancy, faithfulness) from ragas.metrics.base import Metric -from ragas.metrics import ( - context_precision, - context_recall, - context_relevancy, - answer_relevancy, - answer_correctness, - answer_similarity, - faithfulness -) - DEFAULT_METRICS = [ - "answer_relevancy", - "context_precision", - "faithfulness", - "context_recall", - "context_relevancy" - ] + "answer_relevancy", + "context_precision", + "faithfulness", + "context_recall", + "context_relevancy", +] + def wrap_langchain_llm( - model: str, - api_base: str | None, - api_key: str | None + model: str, api_base: str | None, api_key: str | None ) -> LangchainLLM: """ Initializes and returns an instance of the LangchainLLM class. @@ -50,27 +41,23 @@ def wrap_langchain_llm( - The environment variables OPENAI_API_KEY and OPENAI_API_BASE are set to the provided api_key and api_base. 
""" if api_base is None: - print('api_base not provided, assuming OpenAI default') + print("api_base not provided, assuming OpenAI default") if api_key is None: raise ValueError("api_key must be provided") - os.environ['OPENAI_API_KEY'] = api_key + os.environ["OPENAI_API_KEY"] = api_key base = ChatOpenAI(model_name=model) else: - os.environ['OPENAI_API_BASE'] = api_base + os.environ["OPENAI_API_BASE"] = api_base if api_key: - os.environ['OPENAI_API_KEY'] = api_key + os.environ["OPENAI_API_KEY"] = api_key base = ChatOpenAI( - model_name=model, - openai_api_key=api_key, - openai_api_base=api_base + model_name=model, openai_api_key=api_key, openai_api_base=api_base ) return LangchainLLM(llm=base) def set_metrics( - metrics: list[str], - llm: RagasLLM | None, - embeddings: RagasEmbeddings | None + metrics: list[str], llm: RagasLLM | None, embeddings: RagasEmbeddings | None ) -> list[Metric]: """ Sets the metrics for evaluation. @@ -97,30 +84,29 @@ def set_metrics( if not metrics: metrics = DEFAULT_METRICS for m in metrics: - if m == 'context_precision': + if m == "context_precision": ms.append(context_precision) - elif m == 'context_recall': + elif m == "context_recall": ms.append(context_recall) - elif m == 'context_relevancy': + elif m == "context_relevancy": ms.append(context_relevancy) - elif m == 'answer_relevancy': + elif m == "answer_relevancy": ms.append(answer_relevancy) - elif m == 'answer_correctness': + elif m == "answer_correctness": ms.append(answer_correctness) - elif m == 'answer_similarity': + elif m == "answer_similarity": ms.append(answer_similarity) - elif m == 'faithfulness': + elif m == "faithfulness": ms.append(faithfulness) return ms + def wrap_embeddings( - model_type: str, - model_name: str | None, - api_key: str | None + model_type: str, model_name: str | None, api_key: str | None ) -> RagasEmbeddings: - if model_type == 'openai': + if model_type == "openai": return OpenAIEmbeddings(api_key=api_key) - elif model_type == 'huggingface': + elif model_type == "huggingface": return HuggingfaceEmbeddings(model_name=model_name) else: - raise ValueError(f"Invalid model type: {model_type}") \ No newline at end of file + raise ValueError(f"Invalid model type: {model_type}") diff --git a/pypi/ragas_once/setup.py b/pypi/ragas_once/setup.py index a448973c6..46911caf8 100644 --- a/pypi/ragas_once/setup.py +++ b/pypi/ragas_once/setup.py @@ -1,6 +1,6 @@ # make a setup.py for ragacli package -from setuptools import setup, find_packages +from setuptools import find_packages, setup with open("README.md", "r", encoding="utf-8") as f: long_description = f.read() @@ -22,12 +22,8 @@ ], python_requires=">=3.8", install_requires=[ - 'ragas', - 'langchain==0.0.354', + "ragas", + "langchain==0.0.354", ], - entry_points={ - 'console_scripts': [ - 'ro = ragas_once.cli:main' - ] - } -) \ No newline at end of file + entry_points={"console_scripts": ["ro = ragas_once.cli:main"]}, +)