From 4532904c5bd6d685592e2541f7acec4e7276debc Mon Sep 17 00:00:00 2001 From: wangxinbiao <1412146116@qq.com> Date: Mon, 4 Mar 2024 15:07:03 +0800 Subject: [PATCH] feat:add library to data process --- libs/core/kubeagi_core/ops/__init__.py | 0 libs/core/kubeagi_core/ops/common/__init__.py | 0 .../ops/common/special_characters.py | 49 ++++++++++++++ libs/core/kubeagi_core/ops/mapper/__init__.py | 44 +++++++++++++ .../ops/mapper/chinese_convert.py | 66 +++++++++++++++++++ .../ops/mapper/clean_bank_card.py | 37 +++++++++++ .../kubeagi_core/ops/mapper/clean_email.py | 37 +++++++++++ .../kubeagi_core/ops/mapper/clean_emoji.py | 40 +++++++++++ .../kubeagi_core/ops/mapper/clean_html.py | 30 +++++++++ .../kubeagi_core/ops/mapper/clean_id_card.py | 34 ++++++++++ libs/core/kubeagi_core/ops/mapper/clean_ip.py | 45 +++++++++++++ .../kubeagi_core/ops/mapper/clean_phone.py | 37 +++++++++++ .../kubeagi_core/ops/mapper/clean_weixin.py | 46 +++++++++++++ .../kubeagi_core/ops/mapper/fix_unicode.py | 44 +++++++++++++ .../ops/mapper/remove_invisible_characters.py | 37 +++++++++++ .../ops/mapper/space_standardization.py | 40 +++++++++++ libs/core/pyproject.toml | 4 ++ 17 files changed, 590 insertions(+) create mode 100644 libs/core/kubeagi_core/ops/__init__.py create mode 100644 libs/core/kubeagi_core/ops/common/__init__.py create mode 100644 libs/core/kubeagi_core/ops/common/special_characters.py create mode 100644 libs/core/kubeagi_core/ops/mapper/__init__.py create mode 100644 libs/core/kubeagi_core/ops/mapper/chinese_convert.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_bank_card.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_email.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_emoji.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_html.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_id_card.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_ip.py create mode 100644 
libs/core/kubeagi_core/ops/mapper/clean_phone.py create mode 100644 libs/core/kubeagi_core/ops/mapper/clean_weixin.py create mode 100644 libs/core/kubeagi_core/ops/mapper/fix_unicode.py create mode 100644 libs/core/kubeagi_core/ops/mapper/remove_invisible_characters.py create mode 100644 libs/core/kubeagi_core/ops/mapper/space_standardization.py diff --git a/libs/core/kubeagi_core/ops/__init__.py b/libs/core/kubeagi_core/ops/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libs/core/kubeagi_core/ops/common/__init__.py b/libs/core/kubeagi_core/ops/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libs/core/kubeagi_core/ops/common/special_characters.py b/libs/core/kubeagi_core/ops/common/special_characters.py new file mode 100644 index 0000000..9d4ff34 --- /dev/null +++ b/libs/core/kubeagi_core/ops/common/special_characters.py @@ -0,0 +1,49 @@ +# Copyright 2024 KubeAGI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import emoji + +# referenced from https://github.com/alibaba/data-juicer/blob/main/data_juicer/ops/common/special_characters.py#L26 + +# special characters +EMOJI = list(emoji.EMOJI_DATA.keys()) + +# various whitespaces for whitespace normalization +# whitespaces in unicode can be found here: +# https://en.wikipedia.org/wiki/Whitespace_character +VARIOUS_WHITESPACES = { + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + "", + "", + "", + "", + "", + "", +} diff --git a/libs/core/kubeagi_core/ops/mapper/__init__.py b/libs/core/kubeagi_core/ops/mapper/__init__.py new file mode 100644 index 0000000..c1cbaf2 --- /dev/null +++ b/libs/core/kubeagi_core/ops/mapper/__init__.py @@ -0,0 +1,44 @@ +# Copyright 2024 KubeAGI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from kubeagi_core.ops.mapper.chinese_convert import ChineseConvert +from kubeagi_core.ops.mapper.clean_bank_card import CleanBankCard +from kubeagi_core.ops.mapper.clean_email import CleanEmail +from kubeagi_core.ops.mapper.clean_emoji import CleanEmoji +from kubeagi_core.ops.mapper.clean_html import CleanHtml +from kubeagi_core.ops.mapper.clean_id_card import CleanIdCard +from kubeagi_core.ops.mapper.clean_ip import CleanIp +from kubeagi_core.ops.mapper.clean_phone import CleanPhone +from kubeagi_core.ops.mapper.clean_weixin import CleanWeixin +from kubeagi_core.ops.mapper.fix_unicode import FixUnicode +from kubeagi_core.ops.mapper.remove_invisible_characters import ( + RemoveInvisibleCharacters, +) +from kubeagi_core.ops.mapper.space_standardization import SpaceStandardization + +__all__ = [ + "ChineseConvert", + "CleanBankCard", + "CleanEmail", + "CleanEmoji", + "CleanHtml", + "CleanIdCard", + "CleanIp", + "CleanPhone", + "CleanWeixin", + "FixUnicode", + "RemoveInvisibleCharacters", + "SpaceStandardization", +] diff --git a/libs/core/kubeagi_core/ops/mapper/chinese_convert.py b/libs/core/kubeagi_core/ops/mapper/chinese_convert.py new file mode 100644 index 0000000..fcc159f --- /dev/null +++ b/libs/core/kubeagi_core/ops/mapper/chinese_convert.py @@ -0,0 +1,66 @@ +# Copyright 2024 KubeAGI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ChineseConvert:
    """Mapper that converts Chinese text between script variants via OpenCC.

    The conversion direction is chosen once, at construction time, through
    ``mode``; ``process`` then applies that conversion to each text it is given.
    """

    # Conversion modes accepted by ``__init__``. The chosen name is handed
    # unchanged to ``opencc.OpenCC``.
    SUPPORTED_MODES = (
        "s2t",
        "t2s",
        "s2tw",
        "tw2s",
        "s2hk",
        "hk2s",
        "s2twp",
        "tw2sp",
        "t2tw",
        "tw2t",
        "hk2t",
        "t2hk",
        "t2jp",
        "jp2t",
    )

    def __init__(self, mode: str = "t2s"):
        """
        Initialization method.

        :param mode: Choose the mode to convert Chinese,
            s2t: Simplified Chinese to Traditional Chinese,
            t2s: Traditional Chinese to Simplified Chinese,
            s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),
            tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,
            s2hk: Simplified Chinese to Traditional Chinese (Hong Kong variant),
            hk2s: Traditional Chinese (Hong Kong variant) to Simplified Chinese,
            s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
                with Taiwanese idiom,
            tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
                with Mainland Chinese idiom,
            t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),
            tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,
            hk2t: Traditional Chinese (Hong Kong variant) to Traditional Chinese,
            t2hk: Traditional Chinese to Traditional Chinese (Hong Kong variant),
            t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese Kanji,
            jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese Characters,
        :raises AssertionError: if ``mode`` is not one of ``SUPPORTED_MODES``.
        """
        if mode not in self.SUPPORTED_MODES:
            # Raised explicitly rather than through an ``assert`` statement so
            # the check still runs under ``python -O``. The exception type and
            # message are the same ones the assert statement produced.
            raise AssertionError(
                "Please make sure mode is one of {}".format(
                    list(self.SUPPORTED_MODES)
                )
            )

        # Imported here, after validation, so an invalid mode is reported
        # before the converter library is loaded.
        import opencc

        self._opencc_convert = opencc.OpenCC(mode)

    def process(self, text: str) -> str:
        """Return ``text`` converted with the mode chosen at construction."""
        return self._opencc_convert.convert(text)
import re
from typing import Optional


class CleanBankCard:
    """Mapper that replaces bank card numbers found in text.

    The default pattern matches a run of 16 or 19 digits that starts at a
    word boundary, begins with a digit from 1 to 9 and is not followed by
    another ASCII digit.
    """

    def __init__(self, pattern: Optional[str] = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression selecting the text to replace;
            when ``None`` the built-in bank card pattern is used.
        :param repl: replacement for every match (empty string removes it).
        """
        if pattern is None:
            self._pattern = r"\b([1-9]{1})(\d{15}|\d{18})(?![0-9])"
        else:
            self._pattern = pattern

        self._repl = repl
        # Compiled once here so ``process`` does not go through the ``re``
        # module's pattern lookup on every call. DOTALL only matters for
        # caller-supplied patterns that use ``.``.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    def process(self, text: str) -> str:
        """Return ``text`` with every match of the pattern replaced by ``repl``.

        Text without a match comes back unchanged, so no separate search
        pass is needed before substituting.
        """
        return self._regex.sub(self._repl, text)
import re
from typing import Optional


class CleanEmail:
    """Mapper that replaces email addresses found in text.

    The default pattern matches ``local@domain.tld`` where the local part is
    made of ASCII letters, digits and ``._%+-``, the domain of ASCII letters,
    digits, ``.`` and ``-``, and the final label of at least two ASCII letters.
    """

    def __init__(self, pattern: Optional[str] = None, repl: str = ""):
        """
        Initialization method.

        :param pattern: regular expression selecting the text to replace;
            when ``None`` the built-in email pattern is used.
        :param repl: replacement for every match (empty string removes it).
        """
        if pattern is None:
            self._pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
        else:
            self._pattern = pattern

        self._repl = repl
        # Compiled once here so ``process`` does not go through the ``re``
        # module's pattern lookup on every call. DOTALL only matters for
        # caller-supplied patterns that use ``.``.
        self._regex = re.compile(self._pattern, flags=re.DOTALL)

    def process(self, text: str) -> str:
        """Return ``text`` with every match of the pattern replaced by ``repl``.

        Text without a match comes back unchanged, so no separate search
        pass is needed before substituting.
        """
        return self._regex.sub(self._repl, text)
+ + +import re + +from kubeagi_core.ops.common import special_characters + + +class CleanEmoji: + """Mapper to clean emojis in text.""" + + def __init__(self, pattern: str = None, repl: str = ""): + if pattern is None: + emojis = special_characters.EMOJI + self._pattern = "|".join(re.escape(value) for value in emojis) + else: + self._pattern = pattern + + self._repl = repl + + def process(self, text): + if not re.search(self._pattern, text, flags=re.DOTALL): + return text + + clean_text = re.sub( + pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL + ) + return clean_text diff --git a/libs/core/kubeagi_core/ops/mapper/clean_html.py b/libs/core/kubeagi_core/ops/mapper/clean_html.py new file mode 100644 index 0000000..a2a1a91 --- /dev/null +++ b/libs/core/kubeagi_core/ops/mapper/clean_html.py @@ -0,0 +1,30 @@ +# Copyright 2024 KubeAGI. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from selectolax.parser import HTMLParser + + +class CleanHtml: + """Mapper to clean html in text.""" + + def process(self, text): + text = text.replace("