From 4532904c5bd6d685592e2541f7acec4e7276debc Mon Sep 17 00:00:00 2001 From: wangxinbiao <1412146116@qq.com> Date: Mon, 4 Mar 2024 15:07:03 +0800 Subject: [PATCH] feat:add library to data process --- libs/core/kubeagi_core/ops/__init__.py | 0 libs/core/kubeagi_core/ops/common/__init__.py | 0 .../ops/common/special_characters.py | 49 ++++++++++++++ libs/core/kubeagi_core/ops/mapper/__init__.py | 44 +++++++++++++ .../ops/mapper/chinese_convert.py | 66 +++++++++++++++++++ .../ops/mapper/clean_bank_card.py | 37 +++++++++++ .../kubeagi_core/ops/mapper/clean_email.py | 37 +++++++++++ .../kubeagi_core/ops/mapper/clean_emoji.py | 40 +++++++++++ .../kubeagi_core/ops/mapper/clean_html.py | 30 +++++++++ .../kubeagi_core/ops/mapper/clean_id_card.py | 34 ++++++++++ libs/core/kubeagi_core/ops/mapper/clean_ip.py | 45 +++++++++++++ .../kubeagi_core/ops/mapper/clean_phone.py | 37 +++++++++++ .../kubeagi_core/ops/mapper/clean_weixin.py | 46 +++++++++++++ .../kubeagi_core/ops/mapper/fix_unicode.py | 44 +++++++++++++ .../ops/mapper/remove_invisible_characters.py | 37 +++++++++++ .../ops/mapper/space_standardization.py | 40 +++++++++++ libs/core/pyproject.toml | 4 ++ 17 files changed, 590 insertions(+) create mode 100644 libs/core/kubeagi_core/ops/__init__.py create mode 100644 libs/core/kubeagi_core/ops/common/__init__.py create mode 100644 libs/core/kubeagi_core/ops/common/special_characters.py create mode 100644 libs/core/kubeagi_core/ops/mapper/__init__.py create mode 100644 libs/core/kubeagi_core/ops/mapper/chinese_convert.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_bank_card.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_email.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_emoji.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_html.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_id_card.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_ip.py create mode 100644 
libs/core/kubeagi_core/ops/mapper/clean_phone.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_weixin.py create mode 100644 libs/core/kubeagi_core/ops/mapper/fix_unicode.py create mode 100644 libs/core/kubeagi_core/ops/mapper/remove_invisible_characters.py create mode 100644 libs/core/kubeagi_core/ops/mapper/space_standardization.py diff --git a/libs/core/kubeagi_core/ops/__init__.py b/libs/core/kubeagi_core/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libs/core/kubeagi_core/ops/common/__init__.py b/libs/core/kubeagi_core/ops/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libs/core/kubeagi_core/ops/common/special_characters.py b/libs/core/kubeagi_core/ops/common/special_characters.py new file mode 100644 index 0000000..9d4ff34 --- /dev/null +++ b/libs/core/kubeagi_core/ops/common/special_characters.py @@ -0,0 +1,49 @@ +# Copyright 2024 KubeAGI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import emoji + +# referenced from https://github.com/alibaba/data-juicer/blob/main/data_juicer/ops/common/special_characters.py#L26 + +# special characters +EMOJI = list(emoji.EMOJI_DATA.keys()) + +# various whitespaces for whitespace normalization +# whitespaces in unicode can be found here: +# https://en.wikipedia.org/wiki/Whitespace_character +VARIOUS_WHITESPACES = { + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "​", + "‌", + "‍", + "⁠", + "", + "„", +} diff --git a/libs/core/kubeagi_core/ops/mapper/__init__.py b/libs/core/kubeagi_core/ops/mapper/__init__.py new file mode 100644 index 0000000..c1cbaf2 --- /dev/null +++ b/libs/core/kubeagi_core/ops/mapper/__init__.py @@ -0,0 +1,44 @@ +# Copyright 2024 KubeAGI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from kubeagi_core.ops.mapper.chinese_convert import ChineseConvert +from kubeagi_core.ops.mapper.clean_bank_card import CleanBankCard +from kubeagi_core.ops.mapper.clean_email import CleanEmail +from kubeagi_core.ops.mapper.clean_emoji import CleanEmoji +from kubeagi_core.ops.mapper.clean_html import CleanHtml +from kubeagi_core.ops.mapper.clean_id_card import CleanIdCard +from kubeagi_core.ops.mapper.clean_ip import CleanIp +from kubeagi_core.ops.mapper.clean_phone import CleanPhone +from kubeagi_core.ops.mapper.clean_weixin import CleanWeixin +from kubeagi_core.ops.mapper.fix_unicode import FixUnicode +from kubeagi_core.ops.mapper.remove_invisible_characters import ( + RemoveInvisibleCharacters, +) +from kubeagi_core.ops.mapper.space_standardization import SpaceStandardization + +__all__ = [ + "ChineseConvert", + "CleanBankCard", + "CleanEmail", + "CleanEmoji", + "CleanHtml", + "CleanIdCard", + "CleanIp", + "CleanPhone", + "CleanWeixin", + "FixUnicode", + "RemoveInvisibleCharacters", + "SpaceStandardization", +] diff --git a/libs/core/kubeagi_core/ops/mapper/chinese_convert.py b/libs/core/kubeagi_core/ops/mapper/chinese_convert.py new file mode 100644 index 0000000..fcc159f --- /dev/null +++ b/libs/core/kubeagi_core/ops/mapper/chinese_convert.py @@ -0,0 +1,66 @@ +# Copyright 2024 KubeAGI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class _UnsupportedModeError(ValueError, AssertionError):
    """Raised when ChineseConvert receives an unknown conversion mode.

    It also derives from ``AssertionError`` so that callers which caught the
    failure of the former ``assert`` statement keep working.
    """


class ChineseConvert:
    """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese"""

    # Conversion modes accepted by the constructor; the value is handed to
    # ``opencc.OpenCC`` unchanged.
    SUPPORTED_MODES = (
        "s2t",
        "t2s",
        "s2tw",
        "tw2s",
        "s2hk",
        "hk2s",
        "s2twp",
        "tw2sp",
        "t2tw",
        "tw2t",
        "hk2t",
        "t2hk",
        "t2jp",
        "jp2t",
    )

    def __init__(self, mode: str = "t2s"):
        """
        Initialization method.

        :param mode: Choose the mode to convert Chinese,
            s2t: Simplified Chinese to Traditional Chinese,
            t2s: Traditional Chinese to Simplified Chinese,
            s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),
            tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,
            s2hk: Simplified Chinese to Traditional Chinese (Hong Kong variant),
            hk2s: Traditional Chinese (Hong Kong variant) to Simplified Chinese,
            s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
                with Taiwanese idiom,
            tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
                with Mainland Chinese idiom,
            t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),
            tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,
            hk2t: Traditional Chinese (Hong Kong variant) to Traditional Chinese,
            t2hk: Traditional Chinese to Traditional Chinese (Hong Kong variant),
            t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese Kanji,
            jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese Characters,
        :raises ValueError: if ``mode`` is not one of ``SUPPORTED_MODES`` (the
            raised exception is also an ``AssertionError``).
        """
        # Explicit check instead of ``assert``: assertions are removed when
        # Python runs with -O, which would let an invalid mode through.
        if mode not in self.SUPPORTED_MODES:
            raise _UnsupportedModeError(
                f"Please make sure mode is one of {list(self.SUPPORTED_MODES)}"
            )

        self._opencc_convert = opencc.OpenCC(mode)

    def process(self, text):
        """Return ``text`` converted by the configured OpenCC converter."""
        return self._opencc_convert.convert(text)
class CleanBankCard:
    """Mapper to clean bank card in text.

    The default pattern matches a run of exactly 16 or 19 digits that starts
    at a word boundary with a non-zero digit and is not followed by another
    digit.
    """

    DEFAULT_PATTERN = r"\b([1-9]{1})(\d{15}|\d{18})(?![0-9])"

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression describing the text to replace;
            ``None`` selects ``DEFAULT_PATTERN``.
        :param repl: replacement handed to ``re.sub`` for every match.
        :raises re.error: if ``pattern`` is not a valid regular expression.
        """
        self._pattern = self.DEFAULT_PATTERN if pattern is None else pattern
        self._repl = repl
        # Compile once here rather than resolving the pattern on every call.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    def process(self, text):
        """Return ``text`` with every pattern match replaced by ``repl``."""
        # sub() already returns the input unchanged when nothing matches, so
        # a separate search() pass would only scan the text a second time.
        return self._regex.sub(self._repl, text)
class CleanEmail:
    """Mapper to clean email in text.

    The default pattern matches ``local@domain.tld`` where the local part is
    made of ASCII letters, digits and ``._%+-``, the domain of ASCII letters,
    digits, dots and hyphens, and the final label of at least two letters.
    """

    DEFAULT_PATTERN = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression describing the text to replace;
            ``None`` selects ``DEFAULT_PATTERN``.
        :param repl: replacement handed to ``re.sub`` for every match.
        :raises re.error: if ``pattern`` is not a valid regular expression.
        """
        self._pattern = self.DEFAULT_PATTERN if pattern is None else pattern
        self._repl = repl
        # Compile once here rather than resolving the pattern on every call.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    def process(self, text):
        """Return ``text`` with every pattern match replaced by ``repl``."""
        # No search() pre-check: sub() leaves text untouched without a match.
        return self._regex.sub(self._repl, text)
class CleanEmoji:
    """Mapper to clean emojis in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression describing the text to replace;
            ``None`` builds one from ``special_characters.EMOJI``.
        :param repl: replacement handed to ``re.sub`` for every match.
        :raises re.error: if ``pattern`` is not a valid regular expression.
        """
        if pattern is None:
            pattern = self._build_pattern(special_characters.EMOJI)

        self._pattern = pattern
        self._repl = repl
        # The emoji alternation is large; compile it a single time.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    @staticmethod
    def _build_pattern(emojis):
        """Join ``emojis`` into one alternation, longest entries first.

        Regex alternation takes the first alternative that matches, not the
        longest. Without the ordering, an emoji that is a prefix of a longer
        sequence could match first and leave the rest of the sequence behind.
        """
        ordered = sorted((value for value in emojis if value), key=len, reverse=True)
        return "|".join(re.escape(value) for value in ordered)

    def process(self, text):
        """Return ``text`` with every pattern match replaced by ``repl``."""
        # sub() returns the input unchanged when nothing matches.
        return self._regex.sub(self._repl, text)


class CleanHtml:
    """Mapper to clean html in text."""

    # Literal tag rewrites applied, in this order, before the markup is
    # parsed: opening list tags become a "\n*" marker, closing ones vanish.
    _LIST_TAG_REPLACEMENTS = (
        ("<li>", "\n*"),
        ("</li>", ""),
        ("<ol>", "\n*"),
        ("</ol>", ""),
    )

    @classmethod
    def _mark_list_items(cls, text):
        """Apply the literal list-tag rewrites to ``text`` and return it."""
        for tag, replacement in cls._LIST_TAG_REPLACEMENTS:
            text = text.replace(tag, replacement)
        return text

    def process(self, text):
        """Return the text content of ``text`` as extracted by ``HTMLParser``."""
        parser = HTMLParser(self._mark_list_items(text))
        return parser.text()


class CleanIdCard:
    """Mapper to clean id card in text."""

    # Capturing groups are kept exactly as before so that a ``repl`` using
    # backreferences still sees the same group numbers.
    _PATTERNS = (
        # 18-character form: 6 digits, 4-digit year, month, day, 3 digits and
        # a final digit or X. The day class was "[0|1|2]", which also matched
        # a literal "|".
        r"\b([1-9]\d{5}[1-9]\d{3})((0\d)|(1[0-2]))(([0-2]\d)|(3[0-1]))(\d{3}[0-9Xx])(?![0-9])",
        # 15-character form: 8 digits, month, day, 2 digits and a final digit
        # or X. The day field was "[0-2][1-9]", which skipped days 10 and 20;
        # it now mirrors the day field of the 18-character form.
        r"\b([1-9]\d{7})((0\d)|(1[0-2]))(([0-2]\d)|(3[0-1]))(\d{2}[0-9Xx])(?![0-9])",
    )

    def __init__(self, repl: str = ""):
        """
        Initialization method.

        :param repl: replacement handed to ``re.sub`` for every match.
        """
        self._pattern = list(self._PATTERNS)
        self._repl = repl
        self._regexes = [
            re.compile(regex_exp, flags=re.DOTALL) for regex_exp in self._pattern
        ]

    def process(self, text):
        """Return ``text`` after applying each id card pattern in turn."""
        for regex in self._regexes:
            text = regex.sub(self._repl, text)
        return text
class CleanIp:
    """Mapper to clean ipv4 and ipv6 address in text.

    The default pattern matches either four dot-separated decimal octets or
    eight colon-separated groups of one to four hexadecimal digits; shortened
    ("::") IPv6 notation is not covered by it.
    """

    DEFAULT_PATTERN = (
        r"((?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|"
        r"(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))"
        r"{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|"
        r"(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|"
        r"([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4})"
    )

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression describing the text to replace;
            ``None`` selects ``DEFAULT_PATTERN``.
        :param repl: replacement handed to ``re.sub`` for every match.
        :raises re.error: if ``pattern`` is not a valid regular expression.
        """
        self._pattern = self.DEFAULT_PATTERN if pattern is None else pattern
        self._repl = repl
        # Compile once here rather than resolving the pattern on every call.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    def process(self, text):
        """Return ``text`` with every pattern match replaced by ``repl``."""
        # No search() pre-check: sub() leaves text untouched without a match.
        return self._regex.sub(self._repl, text)
class CleanPhone:
    """Mapper to clean phone in text.

    The default pattern matches an optional "+86" or "0086" prefix followed
    by eleven digits: a leading 1, one of the enumerated two-digit
    continuations, then eight more digits, not followed by another digit.
    """

    # The character classes used to be written with commas ("[5,6,7,9]",
    # "[0-3,5-9]", "[1,8,9]"), which made a literal "," an accepted third
    # character, so e.g. "14,12345678" was treated as a phone number.
    # Capturing groups are unchanged so ``repl`` backreferences keep working.
    DEFAULT_PATTERN = (
        r"((\+|00)86)?(1)"
        r"((3[\d])|(4[5679])|(5[0-35-9])|(6[5-7])|(7[0-8])|(8[\d])|(9[189]))"
        r"(\d{8})(?![0-9])"
    )

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression describing the text to replace;
            ``None`` selects ``DEFAULT_PATTERN``.
        :param repl: replacement handed to ``re.sub`` for every match.
        :raises re.error: if ``pattern`` is not a valid regular expression.
        """
        self._pattern = self.DEFAULT_PATTERN if pattern is None else pattern
        self._repl = repl
        # Compile once here rather than resolving the pattern on every call.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    def process(self, text):
        """Return ``text`` with every pattern match replaced by ``repl``."""
        # No search() pre-check: sub() leaves text untouched without a match.
        return self._regex.sub(self._repl, text)
class CleanWeixin:
    """Mapper to clean weixin in text.

    A match is one of the ``_PREFIXES`` labels, a colon (ASCII or fullwidth),
    and 3 to 20 ASCII letters or digits.
    """

    # Labels that introduce an account id; each one is applied as its own
    # pattern, in this order.
    _PREFIXES = (
        "vxin",
        "vx",
        "VX",
        "Vxin",
        "wx",
        "WX",
        "wei xin",
        "weixin",
        "微信",
        "微信号",
        "薇信",
        "薇信号",
        "v信",
        "V信",
    )
    # ASCII colon and fullwidth colon. The separator class used to contain a
    # literal "|" as well, which is not an alternation inside "[...]".
    _SEPARATORS = ":\uff1a"
    # The "{3,20}" quantifier used to sit inside the character class, where
    # it only added "{", "," and "}" to the accepted characters and enforced
    # no length at all.
    _ACCOUNT = r"[a-zA-Z0-9]{3,20}"

    def __init__(self, repl: str = ""):
        """
        Initialization method.

        :param repl: replacement handed to ``re.sub`` for every match.
        """
        separator = "[" + re.escape(self._SEPARATORS) + "]"
        self._pattern = [
            re.escape(prefix) + separator + self._ACCOUNT
            for prefix in self._PREFIXES
        ]
        self._repl = repl
        self._regexes = [
            re.compile(regex_exp, flags=re.DOTALL) for regex_exp in self._pattern
        ]

    def process(self, text):
        """Return ``text`` after applying each weixin pattern in turn."""
        for regex in self._regexes:
            text = regex.sub(self._repl, text)
        return text


class FixUnicode:
    """Mapper to fix unicode errors in text."""

    _SUPPORTED_FORMS = ("NFC", "NFKC", "NFD", "NFKD")

    def __init__(self, normalization: str = None):
        """
        Initialization method.

        :param normalization: the specified form of Unicode
            normalization mode, which can be one of ['NFC',
            'NFKC', 'NFD', and 'NFKD'], default 'NFC'
        :raises ValueError: if ``normalization`` is not one of the
            supported forms (compared case-insensitively).
        """
        # ``None`` and the empty string both fall back to NFC.
        self._normalization = normalization.upper() if normalization else "NFC"

        if self._normalization not in self._SUPPORTED_FORMS:
            raise ValueError(
                f"Normalization mode [{normalization}] is not "
                "supported. Can only be one of "
                '["NFC", "NFKC", "NFD", "NFKD"]'
            )

    def process(self, text):
        """Return ``text`` passed through ``ftfy.fix_text``."""
        return ftfy.fix_text(text, normalization=self._normalization)
class RemoveInvisibleCharacters:
    """Mapper to remove invisible characters in text.

    The default pattern matches the code points U+0000-U+001F, U+007F-U+009F
    and U+00AD.
    """

    # NOTE(review): the U+0000-U+001F range also covers "\n" and "\t", so
    # line breaks and tabs are removed as well. Confirm that this is wanted.
    DEFAULT_PATTERN = r"[\x00-\x1F\x7F-\x9F\xAD\r\t\b\x0B\x1C\x1D\x1E]"

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression describing the text to replace;
            ``None`` selects ``DEFAULT_PATTERN``.
        :param repl: replacement handed to ``re.sub`` for every match.
        :raises re.error: if ``pattern`` is not a valid regular expression.
        """
        self._pattern = self.DEFAULT_PATTERN if pattern is None else pattern
        self._repl = repl
        # Compile once here rather than resolving the pattern on every call.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    def process(self, text):
        """Return ``text`` with every pattern match replaced by ``repl``."""
        # No search() pre-check: sub() leaves text untouched without a match.
        return self._regex.sub(self._repl, text)
class SpaceStandardization:
    """Mapper to space standardization in text."""

    # NOTE(review): with the default ``repl=""`` matched whitespace is
    # deleted rather than normalised, and the default set appears to include
    # the ordinary space. Confirm that callers pass ``repl=" "`` when they
    # want words to stay separated.

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression describing the text to replace;
            ``None`` builds one from
            ``special_characters.VARIOUS_WHITESPACES``.
        :param repl: replacement handed to ``re.sub`` for every match.
        :raises re.error: if ``pattern`` is not a valid regular expression.
        """
        if pattern is None:
            pattern = self._build_pattern(special_characters.VARIOUS_WHITESPACES)

        self._pattern = pattern
        self._repl = repl
        # Compile once here rather than resolving the pattern on every call.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    @staticmethod
    def _build_pattern(whitespaces):
        """Join ``whitespaces`` into one deterministic alternation.

        Empty entries are dropped because an empty alternative matches at
        every position of the text. The entries are sorted, longest first,
        because the source is a set whose iteration order is not stable
        between runs.
        """
        ordered = sorted(
            {value for value in whitespaces if value},
            key=lambda value: (-len(value), value),
        )
        return "|".join(re.escape(value) for value in ordered)

    def process(self, text):
        """Return ``text`` with every pattern match replaced by ``repl``."""
        # No search() pre-check: sub() leaves text untouched without a match.
        return self._regex.sub(self._repl, text)