From 789cb017d2cf9cedc0ef0ca67aebda99886b1b46 Mon Sep 17 00:00:00 2001
From: wangxinbiao <1412146116@qq.com>
Date: Mon, 4 Mar 2024 15:07:03 +0800
Subject: [PATCH] feat: add library to data process

---
 examples/document_transformers/try_pdf2csv.py |  44 +++
 examples/ops/try_transform.py                 | 138 ++++++++
 libs/cli/kubeagi_cli/cli.py                   |   6 +
 libs/cli/kubeagi_cli/convert.py               |  74 +++++
 .../document_transformers/__init__.py         |   1 +
 .../document_transformers/pdf2csv.py          | 306 ++++++++++++++++++
 libs/core/kubeagi_core/embeddings/__init__.py |   0
 libs/core/kubeagi_core/embeddings/openai.py   |  44 +++
 libs/core/kubeagi_core/ops/__init__.py        |   0
 .../kubeagi_core/ops/special_characters.py    |  49 +++
 libs/core/kubeagi_core/ops/transform.py       | 227 +++++++++++++
 libs/core/pyproject.toml                      |   4 +
 12 files changed, 893 insertions(+)
 create mode 100644 examples/document_transformers/try_pdf2csv.py
 create mode 100644 examples/ops/try_transform.py
 create mode 100644 libs/cli/kubeagi_cli/convert.py
 create mode 100644 libs/core/kubeagi_core/document_transformers/__init__.py
 create mode 100644 libs/core/kubeagi_core/document_transformers/pdf2csv.py
 create mode 100644 libs/core/kubeagi_core/embeddings/__init__.py
 create mode 100644 libs/core/kubeagi_core/embeddings/openai.py
 create mode 100644 libs/core/kubeagi_core/ops/__init__.py
 create mode 100644 libs/core/kubeagi_core/ops/special_characters.py
 create mode 100644 libs/core/kubeagi_core/ops/transform.py

diff --git a/examples/document_transformers/try_pdf2csv.py b/examples/document_transformers/try_pdf2csv.py
new file mode 100644
index 0000000..3fa2d3d
--- /dev/null
+++ b/examples/document_transformers/try_pdf2csv.py
@@ -0,0 +1,44 @@
+# Copyright 2024 KubeAGI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from kubeagi_core.document_transformers.pdf2csv import PDF2CSVTransform
+
+
+def test_pdf_to_csv():
+    pdf_transformer = PDF2CSVTransform(
+        file_path="data/test.pdf",
+        llm_config={
+            "model": "6ac7baa2-71e7-4ffc-bd49-9356e743ecbb",
+            "base_url": "http://fastchat-api.172.22.96.167.nip.io/v1",
+            "api_key": "fake",
+            "type": "openai",
+            "temperature": "0.7",
+            "max_tokens": "2048",
+        },
+        data_cleaning_config=[
+            {"type": "chinese_convert"},
+            {"type": "remove_emojis"},
+            {"type": "remove_email", "repl": ""},
+            {"type": "remove_ip_address", "repl": ""},
+            {"type": "remove_phone", "repl": ""},
+        ],
+        output_dir="data",
+    )
+    pdf_transformer.transform()
+    print("<<< Finished")
+
+
+if __name__ == "__main__":
+    test_pdf_to_csv()
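Note: per pdf2csv.py below, the transform writes rows with columns q, a, file_name, page_number, chunk_content, named after the input PDF's stem, so the example above produces data/test.csv. A minimal sketch of reading that output back (the path assumes the example's inputs):

    import csv

    # Hypothetical follow-up to the example above: data/test.csv is the file
    # PDF2CSVTransform writes into output_dir, with the column layout from pdf2csv.py.
    with open("data/test.csv", newline="") as f:
        for row in csv.DictReader(f):
            print(row["q"], "->", row["a"])
            print("  from:", row["file_name"], "page", row["page_number"])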
diff --git a/examples/ops/try_transform.py b/examples/ops/try_transform.py
new file mode 100644
index 0000000..49004df
--- /dev/null
+++ b/examples/ops/try_transform.py
@@ -0,0 +1,138 @@
+# Copyright 2024 KubeAGI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from kubeagi_core.ops.transform import DataConvert
+from kubeagi_core.ops.transform import Clean
+from kubeagi_core.ops.transform import FixUnicode
+
+
+def test_traditional_to_simplified():
+    print(">>> Starting traditional to simplified")
+    text = "風暴帶來的暫停使消防員和其他緊急反應人員得以進入禁區進行結構破壞評估。"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.chinese_convert(text)
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_bank_card_convert():
+    print(">>> Starting convert bank card")
+    text = "银行卡号1:1234567890123456,银行卡号2:12345678901234567,银行卡号3:1234567890123456789"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.bank_card_convert(text=text, repl="xxxx")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_email_convert():
+    print(">>> Starting convert email")
+    text = "如果需要可以联系官方邮箱:172817631@qq.com马上申请为你开通"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.email_convert(text=text, repl="xxxx")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_emoji_convert():
+    print(">>> Starting convert emoji")
+    text = "这是一段带有表情符号😊的文本。"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.emojis_convert(text=text, repl="xxxx")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_clean_html():
+    print(">>> Starting clean html")
+    text = "<div><p>学员成绩单分析报告</p></div>"
+    clean = Clean()
+
+    clean_text = clean.clean_html(text=text)
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_id_card_convert():
+    print(">>> Starting convert id card")
+    text = "身份证号1:123451230112121234, 身份证号2:12345123011212123x"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.id_card_convert(text=text, repl="xxxx")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_ip_convert():
+    print(">>> Starting convert ip")
+    text = "服务器登陆ip为192.168.255.255"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.ip_convert(text=text, repl="xxxx")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_phone_convert():
+    print(">>> Starting convert phone")
+    text = "手机号为18672615192"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.phone_convert(text=text, repl="xxxx")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_weixin_convert():
+    print(">>> Starting convert weixin")
+    text = "你的wx:qw123"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.weixin_convert(text=text, repl="xxxx")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_fix_unicode():
+    print(">>> Starting fix unicode")
+    text = "法律工作者。 — like this one."
+    fix_unicode = FixUnicode()
+
+    clean_text = fix_unicode.process(text=text)
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_invisible_characters_convert():
+    print(">>> Starting convert invisible characters")
+    text = "一户一表、水表出户、抄表到户"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.invisible_characters_convert(text=text, repl="")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
+
+
+def test_space_convert():
+    print(">>> Starting convert space")
+    text = "41 行业大模型标准体系及能力架构研究报告行业大模型“千行百业”落地"
+    data_convert = DataConvert()
+
+    clean_text = data_convert.space_convert(text=text, repl=" ")
+    print("<<< Finished")
+    print(f"clean text: {clean_text}")
diff --git a/libs/cli/kubeagi_cli/cli.py b/libs/cli/kubeagi_cli/cli.py
index a7b7a04..83859e2 100644
--- a/libs/cli/kubeagi_cli/cli.py
+++ b/libs/cli/kubeagi_cli/cli.py
@@ -16,11 +16,17 @@
 import typer
 from typing_extensions import Annotated
 
 from kubeagi_core.evaluation.ragas_eval import RagasEval
+from . import convert
 
 __version__ = "0.0.1"
 
 app = typer.Typer(no_args_is_help=True, add_completion=False)
+app.add_typer(
+    convert.convert_cli,
+    name="convert",
+    help=convert.__doc__,
+)
 
 
 @app.command()
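A quick way to smoke-test the new sub-command wiring without a PDF or a live LLM endpoint (a sketch using Typer's bundled test runner):

    from typer.testing import CliRunner

    from kubeagi_cli.cli import app

    # Exercise only the CLI registration added above; --help avoids needing
    # real inputs or network access.
    runner = CliRunner()
    result = runner.invoke(app, ["convert", "pdf", "--help"])
    assert result.exit_code == 0
    print(result.stdout)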
+
+
+import typer
+import ujson
+
+from typing import List
+from typing_extensions import Annotated
+
+from kubeagi_core.document_transformers.pdf2csv import PDF2CSVTransform
+
+
+convert_cli = typer.Typer(no_args_is_help=True, add_completion=False)
+
+
+@convert_cli.command()
+def pdf(
+    file_path: Annotated[
+        str,
+        typer.Argument(
+            help="file path",
+        ),
+    ] = None,
+    llm_config: Annotated[
+        str,
+        typer.Argument(
+            help="LLM config for generating QA pairs",
+        ),
+    ] = None,
+    data_cleaning_config: Annotated[
+        List[str],
+        typer.Option(help="data cleaning config"),
+    ] = [],
+    output_dir: Annotated[
+        str,
+        typer.Option(help="file output path"),
+    ] = None,
+    chunk_size: Annotated[
+        int,
+        typer.Option(help="text chunk size"),
+    ] = 500,
+    chunk_overlap: Annotated[
+        int,
+        typer.Option(help="text chunk overlap"),
+    ] = 50,
+):
+    """
+    Transform a PDF document into a CSV of QA pairs.
+    """
+    if len(data_cleaning_config) > 0:
+        data_cleaning_config = [ujson.loads(s) for s in data_cleaning_config]
+
+    pdf_transformer = PDF2CSVTransform(
+        file_path=file_path,
+        llm_config=ujson.loads(llm_config),
+        data_cleaning_config=data_cleaning_config,
+        output_dir=output_dir,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+    )
+
+    return pdf_transformer.transform()
diff --git a/libs/core/kubeagi_core/document_transformers/__init__.py b/libs/core/kubeagi_core/document_transformers/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/libs/core/kubeagi_core/document_transformers/__init__.py
@@ -0,0 +1 @@
+
diff --git a/libs/core/kubeagi_core/document_transformers/pdf2csv.py b/libs/core/kubeagi_core/document_transformers/pdf2csv.py
new file mode 100644
index 0000000..9e23cf8
--- /dev/null
+++ b/libs/core/kubeagi_core/document_transformers/pdf2csv.py
@@ -0,0 +1,306 @@
+# Copyright 2024 KubeAGI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import csv
+import logging
+import os
+
+from typing import Any, Dict, List
+from pathlib import Path
+
+from kubeagi_core.document_chunks.spacy_splitter import SpacySplitter
+from kubeagi_core.document_loaders import PDFLoader
+from kubeagi_core.ops.transform import Clean, DataConvert, FixUnicode
+from kubeagi_core.qa_provider.openai import QAProviderOpenAI
+from kubeagi_core.qa_provider.zhipuai import QAProviderZhiPuAIOnline
+
+logger = logging.getLogger(__name__)
+
+
+class PDF2CSVTransform:
+    """
+    Transform a PDF document into a CSV of QA pairs.
+
+    Args:
+        file_path: file path.
+        llm_config: LLM config for generating QA pairs.
+            model: model name to use.
+            base_url: base URL path for API requests.
+            api_key: llm api key.
+            type: llm type.
+            temperature
+            top_p
+            max_tokens
+            prompt_template
+            retry_count: the number of retries when LLM model invocation fails.
+            retry_wait_seconds: the waiting time between each retry when invoking the model.
+        data_cleaning_config: data cleaning config.
+            type: the type of data processing to apply.
+                  NOTE: including the following types
+                  remove_invisible_characters
+                  space_standardization
+                  fix_unicode
+                  chinese_convert
+                  remove_html_tag
+                  remove_emojis
+                  remove_email
+                  remove_ip_address
+                  remove_phone
+                  remove_id_card
+                  remove_weixin
+                  remove_bank_card
+            repl: the replacement value for the data to be processed.
+        output_dir: file output path.
+        chunk_size: chunk size.
+        chunk_overlap: chunk overlap.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        llm_config: Dict[str, Any],
+        data_cleaning_config: List[Dict[str, Any]] = None,
+        output_dir: str = None,
+        chunk_size: int = None,
+        chunk_overlap: int = None,
+    ):
+        if chunk_size is None:
+            chunk_size = 500
+        if chunk_overlap is None:
+            chunk_overlap = 50
+        if output_dir is None:
+            output_dir = os.path.dirname(file_path)
+        if data_cleaning_config is None:
+            data_cleaning_config = []
+
+        self._file_path = file_path
+        self._llm_config = llm_config
+        self._data_cleaning_config = data_cleaning_config
+        self._output_dir = output_dir
+        self._chunk_size = chunk_size
+        self._chunk_overlap = chunk_overlap
+
+    def transform(self):
+        logger.info("start pdf to csv transform")
+        # Text splitter
+        pdf_loader = PDFLoader(file_path=self._file_path)
+        docs = pdf_loader.load()
+
+        text_splitter = SpacySplitter(
+            separator="\n\n",
+            pipeline="zh_core_web_sm",
+            chunk_size=self._chunk_size,
+            chunk_overlap=self._chunk_overlap,
+        )
+        documents = text_splitter.split_documents(docs)
+
+        res = self._data_transform(documents)
+        if res.get("status") != 200:
+            return res
+
+        # save the QA list as CSV
+        qa_data_dict = [["q", "a", "file_name", "page_number", "chunk_content"]]
+        qa_data_dict.extend(res.get("data"))
+
+        path = Path(self._file_path)
+        file_name_without_extension = path.stem
+        file_name = file_name_without_extension + ".csv"
+        output_file_path = self._output_dir + "/" + file_name
+        logger.info(f"file output path {output_file_path}")
+
+        with open(output_file_path, "w", newline="") as file:
+            writer = csv.writer(file)
+            writer.writerows(qa_data_dict)
+
+        return {"status": 200, "message": "", "data": qa_data_dict}
+
+    def _data_transform(self, documents):
+        logger.info("start data cleaning")
+        qa_list = []
+        for document in documents:
+            content = document.page_content.replace("\n", "")
+
+            if len(self._data_cleaning_config) > 0:
+                # remove invisible characters
+                invisible_characters_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_invisible_characters"
+                ]
+                if invisible_characters_item:
+                    content = DataConvert().invisible_characters_convert(
+                        text=content, repl=invisible_characters_item[0].get("repl", "")
+                    )
+
+                # process for nonstandard space
+                space_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "space_standardization"
+                ]
+                if space_item:
+                    content = DataConvert().space_convert(
+                        text=content, repl=space_item[0].get("repl", " ")
+                    )
+
+                # fix unicode
+                unicode_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "fix_unicode"
+                ]
+                if unicode_item:
+                    content = FixUnicode().process(text=content)
+
+                # process for Traditional Chinese to Simplified Chinese
+                chinese_convert_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "chinese_convert"
+                ]
+                if chinese_convert_item:
+                    content = DataConvert().chinese_convert(text=content)
+
+                # process for clean html code in text
+                html_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_html_tag"
+                ]
+                if html_item:
+                    content = Clean().clean_html(text=content)
+
+                # process for remove emojis in text
+                emoji_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_emojis"
+                ]
+                if emoji_item:
+                    content = DataConvert().emojis_convert(
+                        text=content, repl=emoji_item[0].get("repl", "")
+                    )
+
+                # process for remove email in text
+                email_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_email"
+                ]
+                if email_item:
+                    content = DataConvert().email_convert(
+                        text=content, repl=email_item[0].get("repl", "xxxx")
+                    )
+
+                # process for remove ip addresses in text
+                ip_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_ip_address"
+                ]
+                if ip_item:
+                    content = DataConvert().ip_convert(
+                        text=content, repl=ip_item[0].get("repl", "xxxx")
+                    )
+
+                # process for remove phone in text
+                phone_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_phone"
+                ]
+                if phone_item:
+                    content = DataConvert().phone_convert(
+                        text=content, repl=phone_item[0].get("repl", "xxxx")
+                    )
+
+                # process for remove id card in text
+                id_card_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_id_card"
+                ]
+                if id_card_item:
+                    content = DataConvert().id_card_convert(
+                        text=content, repl=id_card_item[0].get("repl", "xxxx")
+                    )
+
+                # process for remove weixin in text
+                weixin_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_weixin"
+                ]
+                if weixin_item:
+                    content = DataConvert().weixin_convert(
+                        text=content, repl=weixin_item[0].get("repl", "xxxx")
+                    )
+
+                # process for remove bank card in text
+                bank_card_item = [
+                    item
+                    for item in self._data_cleaning_config
+                    if item.get("type") == "remove_bank_card"
+                ]
+                if bank_card_item:
+                    content = DataConvert().bank_card_convert(
+                        text=content, repl=bank_card_item[0].get("repl", "xxxx")
+                    )
+
+            # generate qa
+            logger.info("start generate qa")
+            llm_type = self._llm_config.get("type")
+            if llm_type == "openai":
+                # generate QA list by openai
+                qa_provider = QAProviderOpenAI(
+                    api_key=self._llm_config.get("api_key"),
+                    base_url=self._llm_config.get("base_url"),
+                    model=self._llm_config.get("model"),
+                    temperature=self._llm_config.get("temperature"),
+                    max_tokens=self._llm_config.get("max_tokens"),
+                )
+            elif llm_type == "zhipuai":
+                # generate QA list by zhipuai
+                qa_provider = QAProviderZhiPuAIOnline(
+                    api_key=self._llm_config.get("api_key"),
+                    model=self._llm_config.get("model"),
+                    temperature=self._llm_config.get("temperature"),
+                    top_p=self._llm_config.get("top_p"),
+                )
+            else:
+                return {"status": 1000, "message": "This model type is not supported yet", "data": ""}
+
+            data = qa_provider.generate_qa_list(
+                text=content,
+                prompt_template=self._llm_config.get("prompt_template"),
+                retry_count=self._llm_config.get("retry_count"),
+                retry_wait_seconds=self._llm_config.get("retry_wait_seconds"),
+            )
+            if data.get("status") != 200:
+                return data
+
+            for qa in data.get("data"):
+                qa_list.append(
+                    [
+                        qa[0],
+                        qa[1],
+                        document.metadata.get("source"),
+                        document.metadata.get("page"),
+                        document.page_content.replace("\n", ""),
+                    ]
+                )
+            logger.info("generate qa finished")
+
+        return {"status": 200, "message": "", "data": qa_list}
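For reference, a data_cleaning_config that exercises every op the class understands (a sketch; the repl values are illustrative, mirroring the per-op defaults above):

    # Sketch: one entry per supported type, per the PDF2CSVTransform docstring.
    full_cleaning_config = [
        {"type": "remove_invisible_characters", "repl": ""},
        {"type": "space_standardization", "repl": " "},
        {"type": "fix_unicode"},
        {"type": "chinese_convert"},
        {"type": "remove_html_tag"},
        {"type": "remove_emojis", "repl": ""},
        {"type": "remove_email", "repl": "xxxx"},
        {"type": "remove_ip_address", "repl": "xxxx"},
        {"type": "remove_phone", "repl": "xxxx"},
        {"type": "remove_id_card", "repl": "xxxx"},
        {"type": "remove_weixin", "repl": "xxxx"},
        {"type": "remove_bank_card", "repl": "xxxx"},
    ]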
diff --git a/libs/core/kubeagi_core/embeddings/__init__.py b/libs/core/kubeagi_core/embeddings/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/libs/core/kubeagi_core/embeddings/openai.py b/libs/core/kubeagi_core/embeddings/openai.py
new file mode 100644
index 0000000..261de06
--- /dev/null
+++ b/libs/core/kubeagi_core/embeddings/openai.py
@@ -0,0 +1,44 @@
+# Copyright 2024 KubeAGI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from langchain_community.embeddings import OpenAIEmbeddings
+from typing import List
+
+
+class OpenAIEmbedding:
+    def __init__(
+        self,
+        base_url: str,
+        api_key: str,
+        model: str,
+    ):
+        """OpenAI embedding models.
+
+        Args:
+            base_url (str): custom endpoint of an OpenAI-compatible service.
+            api_key (str): API key for the service.
+            model (str): embeddings model name.
+        """
+        self._embeddings = OpenAIEmbeddings(
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+        )
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        return self._embeddings.embed_documents(texts=texts)
+
+    def embed_query(self, text: str) -> List[float]:
+        return self._embeddings.embed_query(text=text)
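A minimal usage sketch for the wrapper above; the endpoint, key, and model name are placeholders, not values shipped in this patch:

    from kubeagi_core.embeddings.openai import OpenAIEmbedding

    # Placeholders: point these at any OpenAI-compatible embeddings endpoint.
    embedding = OpenAIEmbedding(
        base_url="http://localhost:8000/v1",
        api_key="fake",
        model="text-embedding-ada-002",
    )

    vectors = embedding.embed_documents(["一户一表", "抄表到户"])
    query_vector = embedding.embed_query("水表")
    print(len(vectors), len(query_vector))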
diff --git a/libs/core/kubeagi_core/ops/__init__.py b/libs/core/kubeagi_core/ops/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/libs/core/kubeagi_core/ops/special_characters.py b/libs/core/kubeagi_core/ops/special_characters.py
new file mode 100644
index 0000000..9d4ff34
--- /dev/null
+++ b/libs/core/kubeagi_core/ops/special_characters.py
@@ -0,0 +1,49 @@
+# Copyright 2024 KubeAGI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import emoji
+
+# referenced from https://github.com/alibaba/data-juicer/blob/main/data_juicer/ops/common/special_characters.py#L26
+
+# special characters
+EMOJI = list(emoji.EMOJI_DATA.keys())
+
+# various whitespaces for whitespace normalization
+# whitespaces in unicode can be found here:
+# https://en.wikipedia.org/wiki/Whitespace_character
+VARIOUS_WHITESPACES = {
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    " ",
+    "​",
+    "‌",
+    "‍",
+    "⁠",
+    "",
+    "„",
+}
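ops/transform.py (next hunk) consumes these tables by escaping each entry and joining them into a single regex alternation; a minimal sketch of that step:

    import re

    import emoji

    # Mirrors what DataConvert.emojis_convert below does with the EMOJI table:
    # escape every entry and join them into one alternation pattern.
    EMOJI = list(emoji.EMOJI_DATA.keys())
    pattern = "|".join(re.escape(value) for value in EMOJI)

    print(re.sub(pattern, "", "这是一段带有表情符号😊的文本。"))  # emoji removed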
diff --git a/libs/core/kubeagi_core/ops/transform.py b/libs/core/kubeagi_core/ops/transform.py
new file mode 100644
index 0000000..c67d56c
--- /dev/null
+++ b/libs/core/kubeagi_core/ops/transform.py
@@ -0,0 +1,227 @@
+# Copyright 2024 KubeAGI.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import opencc
+import re
+import ftfy
+
+from selectolax.parser import HTMLParser
+from kubeagi_core.ops import special_characters
+
+
+class DataConvert:
+    def chinese_convert(self, text, mode: str = "t2s"):
+        """
+        :param mode: Choose the mode to convert Chinese,
+            s2t: Simplified Chinese to Traditional Chinese,
+            t2s: Traditional Chinese to Simplified Chinese,
+            s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),
+            tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,
+            s2hk: Simplified Chinese to Traditional Chinese (Hong Kong variant),
+            hk2s: Traditional Chinese (Hong Kong variant) to Simplified Chinese,
+            s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
+                with Taiwanese idiom,
+            tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
+                with Mainland Chinese idiom,
+            t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),
+            tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,
+            hk2t: Traditional Chinese (Hong Kong variant) to Traditional Chinese,
+            t2hk: Traditional Chinese to Traditional Chinese (Hong Kong variant),
+            t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese Kanji,
+            jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese Characters,
+        """
+        mode_list = [
+            "s2t",
+            "t2s",
+            "s2tw",
+            "tw2s",
+            "s2hk",
+            "hk2s",
+            "s2twp",
+            "tw2sp",
+            "t2tw",
+            "tw2t",
+            "hk2t",
+            "t2hk",
+            "t2jp",
+            "jp2t",
+        ]
+        assert mode in mode_list, "Please make sure mode is one of {}".format(mode_list)
+
+        clean_text = opencc.OpenCC(mode).convert(text)
+        return clean_text
+
+    def bank_card_convert(self, text, pattern: str = None, repl: str = ""):
+        """convert bank card in text."""
+        if pattern is None:
+            pattern = r"\b([1-9]{1})(\d{15}|\d{18})(?![0-9])"
+
+        if not re.search(pattern, text, flags=re.DOTALL):
+            return text
+
+        clean_text = re.sub(pattern=pattern, repl=repl, string=text, flags=re.DOTALL)
+        return clean_text
+
+    def email_convert(self, text, pattern: str = None, repl: str = ""):
+        """convert email in text."""
+        if pattern is None:
+            pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
+
+        if not re.search(pattern, text, flags=re.DOTALL):
+            return text
+
+        clean_text = re.sub(pattern=pattern, repl=repl, string=text, flags=re.DOTALL)
+        return clean_text
+
+    def emojis_convert(self, text, pattern: str = None, repl: str = ""):
+        """convert emojis in text."""
+        if pattern is None:
+            emojis = special_characters.EMOJI
+            pattern = "|".join(re.escape(value) for value in emojis)
+
+        if not re.search(pattern, text, flags=re.DOTALL):
+            return text
+
+        clean_text = re.sub(pattern=pattern, repl=repl, string=text, flags=re.DOTALL)
+        return clean_text
+
+    def id_card_convert(self, text, repl: str = ""):
+        """convert id card in text."""
+        pattern = [
+            r"\b([1-9]\d{5}[1-9]\d{3})((0\d)|(1[0-2]))(([0|1|2]\d)|(3[0-1]))(\d{3}[0-9Xx])(?![0-9])",
+            r"\b([1-9]\d{7})((0\d)|(1[0-2]))(([0-2][1-9])|(3[0-1]))(\d{2}[0-9Xx])(?![0-9])",
+        ]
+
+        for regex_exp in pattern:
+            text = re.sub(pattern=regex_exp, repl=repl, string=text, flags=re.DOTALL)
+        return text
+
+    def ip_convert(self, text, pattern: str = None, repl: str = ""):
+        """convert ip in text."""
+        if pattern is None:
+            pattern = "".join(
+                [
+                    r"((?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|",
+                    r"(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))",
+                    r"{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|",
+                    r"(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|",
+                    r"([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4})",
+                ]
+            )
+
+        if not re.search(pattern, text, flags=re.DOTALL):
+            return text
+
+        clean_text = re.sub(pattern=pattern, repl=repl, string=text, flags=re.DOTALL)
+        return clean_text
+
+    def phone_convert(self, text, pattern: str = None, repl: str = ""):
+        """convert phone in text."""
+        if pattern is None:
+            pattern = r"((\+|00)86)?(1)((3[\d])|(4[5,6,7,9])|(5[0-3,5-9])|(6[5-7])|(7[0-8])|(8[\d])|(9[1,8,9]))(\d{8})(?![0-9])"
+
+        if not re.search(pattern, text, flags=re.DOTALL):
+            return text
+
+        clean_text = re.sub(pattern=pattern, repl=repl, string=text, flags=re.DOTALL)
+        return clean_text
+
+    def weixin_convert(self, text, pattern: str = None, repl: str = ""):
+        """convert weixin in text."""
+        if pattern is None:
+            pattern = [
+                r"vxin[:：][a-zA-Z0-9]{3,20}",
+                r"vx[:：][a-zA-Z0-9]{3,20}",
+                r"VX[:：][a-zA-Z0-9]{3,20}",
+                r"Vxin[:：][a-zA-Z0-9]{3,20}",
+                r"wx[:：][a-zA-Z0-9]{3,20}",
+                r"WX[:：][a-zA-Z0-9]{3,20}",
+                r"wei xin[:：][a-zA-Z0-9]{3,20}",
+                r"weixin[:：][a-zA-Z0-9]{3,20}",
+                r"微信[:：][a-zA-Z0-9]{3,20}",
+                r"微信号[:：][a-zA-Z0-9]{3,20}",
+                r"薇信[:：][a-zA-Z0-9]{3,20}",
+                r"薇信号[:：][a-zA-Z0-9]{3,20}",
+                r"v信[:：][a-zA-Z0-9]{3,20}",
+                r"V信[:：][a-zA-Z0-9]{3,20}",
+            ]
+
+        for regex_exp in pattern:
+            text = re.sub(pattern=regex_exp, repl=repl, string=text, flags=re.DOTALL)
+        return text
+
+    def invisible_characters_convert(self, text, pattern: str = None, repl: str = ""):
+        """convert invisible characters in text."""
+        if pattern is None:
+            pattern = r"[\x00-\x1F\x7F-\x9F\xAD\r\t\b\x0B\x1C\x1D\x1E]"
+
+        if not re.search(pattern, text, flags=re.DOTALL):
+            return text
+
+        clean_text = re.sub(pattern=pattern, repl=repl, string=text, flags=re.DOTALL)
+        return clean_text
+
+    def space_convert(self, text, pattern: str = None, repl: str = " "):
+        """convert space in text."""
+        if pattern is None:
+            various_whitespaces = special_characters.VARIOUS_WHITESPACES
+            pattern = "|".join(re.escape(value) for value in various_whitespaces)
+
+        if not re.search(pattern, text, flags=re.DOTALL):
+            return text
+
+        clean_text = re.sub(pattern=pattern, repl=repl, string=text, flags=re.DOTALL)
+        return clean_text
+
+
+class Clean:
+    def clean_html(self, text):
+        """clean html in text."""
+        text = text.replace("<li>", "\n*")
+        text = text.replace("</li>", "")
+        text = text.replace("<ol>", "\n*")
+        text = text.replace("</ol>", "")
+        parser = HTMLParser(text)
+
+        clean_text = parser.text()
+        return clean_text
+
+
+class FixUnicode:
+    """fix unicode errors in text."""
+
+    def __init__(self, normalization: str = None):
+        """
+        Initialization method.
+
+        :param normalization: the specified form of Unicode
+            normalization mode, which can be one of
+            ['NFC', 'NFKC', 'NFD', 'NFKD'], default 'NFC'
+        """
+        if normalization and len(normalization) > 0:
+            self._normalization = normalization.upper()
+        else:
+            self._normalization = "NFC"
+
+        if self._normalization.upper() not in ["NFC", "NFKC", "NFD", "NFKD"]:
+            raise ValueError(
+                f"Normalization mode [{normalization}] is not "
+                "supported. Can only be one of "
+                '["NFC", "NFKC", "NFD", "NFKD"]'
+            )
+
+    def process(self, text):
+        clean_text = ftfy.fix_text(text, normalization=self._normalization)
+        return clean_text
diff --git a/libs/core/pyproject.toml b/libs/core/pyproject.toml
index 9995154..422060d 100644
--- a/libs/core/pyproject.toml
+++ b/libs/core/pyproject.toml
@@ -17,9 +17,13 @@ classifiers = [
 ]
 dependencies = [
     "docx2txt==0.8",
+    "emoji==2.2.0",
+    "ftfy==6.1.1",
     "kubernetes==25.3.0",
     "langchain==0.1.0",
+    "opencc-python-reimplemented==0.1.7",
     "ragas==0.0.22",
+    "selectolax==0.3.17",
     "spacy==3.5.4",
     "zhipuai==1.0.7",
 ]