Skip to content

Commit

Permalink
feat:add library to data process
Browse files Browse the repository at this point in the history
  • Loading branch information
wangxinbiao committed Mar 4, 2024
1 parent 1986793 commit 4532904
Show file tree
Hide file tree
Showing 17 changed files with 590 additions and 0 deletions.
Empty file.
Empty file.
49 changes: 49 additions & 0 deletions libs/core/kubeagi_core/ops/common/special_characters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import emoji

# referenced from https://github.com/alibaba/data-juicer/blob/main/data_juicer/ops/common/special_characters.py#L26

# Special characters: every key of the `emoji` package's EMOJI_DATA table,
# materialised as a list.
EMOJI = list(emoji.EMOJI_DATA)

# Whitespace (and zero-width) characters used for whitespace normalization.
# The Unicode whitespace characters are listed at:
# https://en.wikipedia.org/wiki/Whitespace_character
VARIOUS_WHITESPACES = {
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
" ",
"​",
"‌",
"‍",
"⁠",
"",
"„",
}
44 changes: 44 additions & 0 deletions libs/core/kubeagi_core/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from kubeagi_core.ops.mapper.chinese_convert import ChineseConvert
from kubeagi_core.ops.mapper.clean_bank_card import CleanBankCard
from kubeagi_core.ops.mapper.clean_email import CleanEmail
from kubeagi_core.ops.mapper.clean_emoji import CleanEmoji
from kubeagi_core.ops.mapper.clean_html import CleanHtml
from kubeagi_core.ops.mapper.clean_id_card import CleanIdCard
from kubeagi_core.ops.mapper.clean_ip import CleanIp
from kubeagi_core.ops.mapper.clean_phone import CleanPhone
from kubeagi_core.ops.mapper.clean_weixin import CleanWeixin
from kubeagi_core.ops.mapper.fix_unicode import FixUnicode
from kubeagi_core.ops.mapper.remove_invisible_characters import (
RemoveInvisibleCharacters,
)
from kubeagi_core.ops.mapper.space_standardization import SpaceStandardization

# Public API of the mapper package: one entry per mapper class imported above,
# kept in the same (alphabetical) order as the imports.
__all__ = [
    "ChineseConvert",
    "CleanBankCard",
    "CleanEmail",
    "CleanEmoji",
    "CleanHtml",
    "CleanIdCard",
    "CleanIp",
    "CleanPhone",
    "CleanWeixin",
    "FixUnicode",
    "RemoveInvisibleCharacters",
    "SpaceStandardization",
]
66 changes: 66 additions & 0 deletions libs/core/kubeagi_core/ops/mapper/chinese_convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import opencc


class ChineseConvert:
    """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese.

    The conversion itself is delegated to an ``opencc.OpenCC`` converter that
    is created once in ``__init__`` for the requested mode.
    """

    # Conversion modes accepted by ``__init__``; each one is documented there.
    _SUPPORTED_MODES = (
        "s2t",
        "t2s",
        "s2tw",
        "tw2s",
        "s2hk",
        "hk2s",
        "s2twp",
        "tw2sp",
        "t2tw",
        "tw2t",
        "hk2t",
        "t2hk",
        "t2jp",
        "jp2t",
    )

    def __init__(self, mode: str = "t2s"):
        """
        Initialization method.
        :param mode: Choose the mode to convert Chinese,
            s2t: Simplified Chinese to Traditional Chinese,
            t2s: Traditional Chinese to Simplified Chinese,
            s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),
            tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,
            s2hk: Simplified Chinese to Traditional Chinese (Hong Kong variant),
            hk2s: Traditional Chinese (Hong Kong variant) to Simplified Chinese,
            s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
                with Taiwanese idiom,
            tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
                with Mainland Chinese idiom,
            t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),
            tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,
            hk2t: Traditional Chinese (Hong Kong variant) to Traditional Chinese,
            t2hk: Traditional Chinese to Traditional Chinese (Hong Kong variant),
            t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese Kanji,
            jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese Characters,
        :raises AssertionError: if ``mode`` is not one of the modes above.
        """
        if mode not in self._SUPPORTED_MODES:
            # Raised explicitly instead of through an ``assert`` statement so
            # the check still runs under ``python -O``; the exception type and
            # message are the same as the assert produced.
            raise AssertionError(
                "Please make sure mode is one of {}".format(
                    list(self._SUPPORTED_MODES)
                )
            )

        self._opencc_convert = opencc.OpenCC(mode)

    def process(self, text):
        """
        Convert ``text`` with the converter configured in ``__init__``.
        :param text: the text to convert.
        :return: whatever the OpenCC converter's ``convert`` returns for ``text``.
        """
        return self._opencc_convert.convert(text)
37 changes: 37 additions & 0 deletions libs/core/kubeagi_core/ops/mapper/clean_bank_card.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import re


class CleanBankCard:
    """Mapper to clean bank card in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.
        :param pattern: regular expression describing what to remove. When
            None, a default is used that matches a run of exactly 16 or 19
            digits whose first digit is 1-9, starting at a word boundary and
            not followed by another digit.
        :param repl: replacement handed to ``re.sub`` for every match
            (empty string by default, i.e. the match is deleted).
        """
        if pattern is None:
            # NOTE(review): for str patterns ``\b`` treats CJK characters as
            # word characters, so a number written directly after Chinese text
            # (no space or punctuation) is not matched — confirm this is intended.
            pattern = r"\b([1-9]{1})(\d{15}|\d{18})(?![0-9])"

        self._pattern = pattern
        self._repl = repl

    def process(self, text):
        """
        Replace every match of the configured pattern in ``text``.
        :param text: the text to clean.
        :return: the cleaned text; unchanged when nothing matches.
        """
        # re.sub already returns the input unchanged when there is no match,
        # so a separate re.search pre-scan would only scan the text twice.
        return re.sub(
            pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL
        )
37 changes: 37 additions & 0 deletions libs/core/kubeagi_core/ops/mapper/clean_email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import re


class CleanEmail:
    """Mapper to clean email in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.
        :param pattern: regular expression describing what to remove. When
            None, a default is used that matches ``local@domain.tld`` where
            the local part is letters, digits and ``._%+-``, the domain is
            letters, digits, dots and hyphens, and the final label is at
            least two ASCII letters.
        :param repl: replacement handed to ``re.sub`` for every match
            (empty string by default, i.e. the match is deleted).
        """
        if pattern is None:
            pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

        self._pattern = pattern
        self._repl = repl

    def process(self, text):
        """
        Replace every match of the configured pattern in ``text``.
        :param text: the text to clean.
        :return: the cleaned text; unchanged when nothing matches.
        """
        # re.sub already returns the input unchanged when there is no match,
        # so a separate re.search pre-scan would only scan the text twice.
        return re.sub(
            pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL
        )
40 changes: 40 additions & 0 deletions libs/core/kubeagi_core/ops/mapper/clean_emoji.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import re

from kubeagi_core.ops.common import special_characters


class CleanEmoji:
    """Mapper to clean emojis in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.
        :param pattern: regular expression describing what to remove. When
            None, the pattern is an alternation of every entry of
            ``special_characters.EMOJI``, each escaped with ``re.escape``.
        :param repl: replacement handed to ``re.sub`` for every match
            (empty string by default, i.e. the match is deleted).
        """
        if pattern is None:
            # Regex alternation takes the first branch that matches, so the
            # longest entries must come first: otherwise a short emoji that is
            # a prefix of a longer multi-code-point sequence can win and leave
            # the rest of that sequence (joiners, modifiers) in the text.
            emojis = sorted(special_characters.EMOJI, key=len, reverse=True)
            pattern = "|".join(re.escape(value) for value in emojis)

        self._pattern = pattern
        self._repl = repl

    def process(self, text):
        """
        Replace every match of the configured pattern in ``text``.
        :param text: the text to clean.
        :return: the cleaned text; unchanged when nothing matches.
        """
        # re.sub already returns the input unchanged when there is no match,
        # so a separate re.search pre-scan would only scan the text twice.
        return re.sub(
            pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL
        )
30 changes: 30 additions & 0 deletions libs/core/kubeagi_core/ops/mapper/clean_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from selectolax.parser import HTMLParser


class CleanHtml:
    """Mapper to clean html in text."""

    def process(self, text):
        """
        Strip HTML markup from ``text``.

        A few list tags are first rewritten as plain text (``<li>`` and
        ``<ol>`` become a newline followed by ``*``; their closing tags are
        dropped), then the result is parsed with selectolax's ``HTMLParser``
        and its ``text()`` is returned.
        :param text: the text, possibly containing HTML, to clean.
        :return: the text returned by the parser.
        """
        tag_rewrites = (
            ("<li>", "\n*"),
            ("</li>", ""),
            ("<ol>", "\n*"),
            ("</ol>", ""),
        )
        for tag, substitute in tag_rewrites:
            text = text.replace(tag, substitute)

        return HTMLParser(text).text()
34 changes: 34 additions & 0 deletions libs/core/kubeagi_core/ops/mapper/clean_id_card.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import re


class CleanIdCard:
    """Mapper to clean id card in text."""

    def __init__(self, repl: str = ""):
        """
        Initialization method.
        :param repl: replacement handed to ``re.sub`` for every match
            (empty string by default, i.e. the match is deleted).
        """
        # Capture-group numbering is kept identical to the earlier patterns so
        # a ``repl`` that uses back-references keeps working.
        # NOTE(review): for str patterns ``\b`` treats CJK characters as word
        # characters, so a number written directly after Chinese text is not
        # matched — confirm this is intended.
        self._pattern = [
            # 18 characters: 6-digit prefix, 4-digit year, month, day,
            # 3 digits and a final digit or X/x. The day's first digit class
            # is [0-2]; the previous "[0|1|2]" also accepted a literal "|".
            r"\b([1-9]\d{5}[1-9]\d{3})((0\d)|(1[0-2]))(([0-2]\d)|(3[0-1]))(\d{3}[0-9Xx])(?![0-9])",
            # 15 characters: 8 digits, month, day, then 2 digits and a final
            # digit or X/x. The day accepts 01-31; the previous "[0-2][1-9]"
            # missed days 10 and 20.
            r"\b([1-9]\d{7})((0\d)|(1[0-2]))((0[1-9]|[12]\d)|(3[0-1]))(\d{2}[0-9Xx])(?![0-9])",
        ]
        self._repl = repl

    def process(self, text):
        """
        Replace every match of each id-card pattern in ``text``.

        The patterns are applied one after another, each on the output of
        the previous one.
        :param text: the text to clean.
        :return: the cleaned text; unchanged when nothing matches.
        """
        for regex_exp in self._pattern:
            text = re.sub(
                pattern=regex_exp, repl=self._repl, string=text, flags=re.DOTALL
            )
        return text
45 changes: 45 additions & 0 deletions libs/core/kubeagi_core/ops/mapper/clean_ip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import re


class CleanIp:
    """Mapper to clean ipv4 and ipv6 address in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.
        :param pattern: regular expression describing what to remove. When
            None, a default is used that matches either a dotted IPv4 address
            (four octets, each 0-255) or an IPv6 address written in full as
            eight colon-separated groups of 1-4 hex digits (the compressed
            ``::`` form is not covered by this pattern).
        :param repl: replacement handed to ``re.sub`` for every match
            (empty string by default, i.e. the match is deleted).
        """
        if pattern is None:
            pattern = "".join(
                [
                    # IPv4: three "octet." parts ...
                    r"((?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|",
                    r"(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))",
                    # ... followed by a final octet without the dot.
                    r"{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|",
                    r"(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|",
                    # IPv6: seven "group:" parts followed by a final group.
                    r"([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4})",
                ]
            )

        self._pattern = pattern
        self._repl = repl

    def process(self, text):
        """
        Replace every match of the configured pattern in ``text``.
        :param text: the text to clean.
        :return: the cleaned text; unchanged when nothing matches.
        """
        # re.sub already returns the input unchanged when there is no match,
        # so a separate re.search pre-scan would only scan the text twice.
        return re.sub(
            pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL
        )
Loading

0 comments on commit 4532904

Please sign in to comment.