-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1986793
commit 4532904
Showing
17 changed files
with
590 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import emoji | ||
|
||
# referenced from https://github.com/alibaba/data-juicer/blob/main/data_juicer/ops/common/special_characters.py#L26 | ||
|
||
# special characters
# Every key of ``emoji.EMOJI_DATA``, materialized once at import time.
EMOJI = list(emoji.EMOJI_DATA)
|
||
# various whitespaces for whitespace normalization | ||
# whitespaces in unicode can be found here: | ||
# https://en.wikipedia.org/wiki/Whitespace_character | ||
VARIOUS_WHITESPACES = { | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
" ", | ||
"", | ||
"", | ||
"", | ||
"", | ||
"", | ||
"", | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
from kubeagi_core.ops.mapper.chinese_convert import ChineseConvert | ||
from kubeagi_core.ops.mapper.clean_bank_card import CleanBankCard | ||
from kubeagi_core.ops.mapper.clean_email import CleanEmail | ||
from kubeagi_core.ops.mapper.clean_emoji import CleanEmoji | ||
from kubeagi_core.ops.mapper.clean_html import CleanHtml | ||
from kubeagi_core.ops.mapper.clean_id_card import CleanIdCard | ||
from kubeagi_core.ops.mapper.clean_ip import CleanIp | ||
from kubeagi_core.ops.mapper.clean_phone import CleanPhone | ||
from kubeagi_core.ops.mapper.clean_weixin import CleanWeixin | ||
from kubeagi_core.ops.mapper.fix_unicode import FixUnicode | ||
from kubeagi_core.ops.mapper.remove_invisible_characters import ( | ||
RemoveInvisibleCharacters, | ||
) | ||
from kubeagi_core.ops.mapper.space_standardization import SpaceStandardization | ||
|
||
# Public API of this package: the mapper classes imported above, listed
# alphabetically so that ``from <package> import *`` exposes exactly these names.
__all__ = [
    "ChineseConvert",
    "CleanBankCard",
    "CleanEmail",
    "CleanEmoji",
    "CleanHtml",
    "CleanIdCard",
    "CleanIp",
    "CleanPhone",
    "CleanWeixin",
    "FixUnicode",
    "RemoveInvisibleCharacters",
    "SpaceStandardization",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import opencc | ||
|
||
|
||
class ChineseConvert:
    """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese"""

    # Conversion modes accepted by ``__init__``; each is passed verbatim to
    # ``opencc.OpenCC`` as the configuration name.
    _SUPPORTED_MODES = (
        "s2t",
        "t2s",
        "s2tw",
        "tw2s",
        "s2hk",
        "hk2s",
        "s2twp",
        "tw2sp",
        "t2tw",
        "tw2t",
        "hk2t",
        "t2hk",
        "t2jp",
        "jp2t",
    )

    def __init__(self, mode: str = "t2s"):
        """
        Initialization method.
        :param mode: Choose the mode to convert Chinese,
            s2t: Simplified Chinese to Traditional Chinese,
            t2s: Traditional Chinese to Simplified Chinese,
            s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),
            tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,
            s2hk: Simplified Chinese to Traditional Chinese (Hong Kong variant),
            hk2s: Traditional Chinese (Hong Kong variant) to Simplified Chinese,
            s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
                with Taiwanese idiom,
            tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
                with Mainland Chinese idiom,
            t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),
            tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,
            hk2t: Traditional Chinese (Hong Kong variant) to Traditional Chinese,
            t2hk: Traditional Chinese to Traditional Chinese (Hong Kong variant),
            t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese Kanji,
            jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese Characters,
        :raises ValueError: if ``mode`` is not one of the modes listed above.
        """
        # Raise explicitly instead of using ``assert``: assertions are removed
        # when Python runs with ``-O``, which would let an invalid mode through.
        if mode not in self._SUPPORTED_MODES:
            raise ValueError(
                "Please make sure mode is one of {}".format(
                    list(self._SUPPORTED_MODES)
                )
            )

        self._opencc_convert = opencc.OpenCC(mode)

    def process(self, text):
        """
        Convert ``text`` with the converter configured in ``__init__``.
        :param text: the text to convert.
        :return: the value returned by the OpenCC converter's ``convert``.
        """
        return self._opencc_convert.convert(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import re | ||
|
||
|
||
class CleanBankCard:
    """Mapper to clean bank card in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.
        :param pattern: regular expression describing what to replace. When
            None, a built-in pattern is used that matches a run of exactly 16
            or 19 digits whose first digit is non-zero, starting at a word
            boundary and not followed by another digit.
        :param repl: replacement passed to ``re.sub`` for every match
            (empty string by default, i.e. matches are removed).
        """
        if pattern is None:
            pattern = r"\b([1-9]{1})(\d{15}|\d{18})(?![0-9])"

        self._pattern = pattern
        self._repl = repl

    def process(self, text):
        """
        Replace every match of the configured pattern in ``text``.
        :param text: the text to clean.
        :return: the cleaned text; equal to ``text`` when nothing matches.
        """
        # ``re.sub`` already leaves the text untouched when there is no match,
        # so a separate ``re.search`` pre-check would only scan the text twice.
        return re.sub(
            pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import re | ||
|
||
|
||
class CleanEmail:
    """Mapper to clean email in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.
        :param pattern: regular expression describing what to replace. When
            None, a built-in pattern of the form ``local@domain.tld`` is used
            (the final label must be at least two ASCII letters).
        :param repl: replacement passed to ``re.sub`` for every match
            (empty string by default, i.e. matches are removed).
        """
        if pattern is None:
            pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

        self._pattern = pattern
        self._repl = repl

    def process(self, text):
        """
        Replace every match of the configured pattern in ``text``.
        :param text: the text to clean.
        :return: the cleaned text; equal to ``text`` when nothing matches.
        """
        # ``re.sub`` already leaves the text untouched when there is no match,
        # so a separate ``re.search`` pre-check would only scan the text twice.
        return re.sub(
            pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import re | ||
|
||
from kubeagi_core.ops.common import special_characters | ||
|
||
|
||
class CleanEmoji:
    """Mapper to clean emojis in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.
        :param pattern: regular expression describing what to replace. When
            None, a pattern matching any entry of
            ``special_characters.EMOJI`` is built.
        :param repl: replacement passed to ``re.sub`` for every match
            (empty string by default, i.e. matches are removed).
        """
        if pattern is None:
            pattern = self._build_pattern(special_characters.EMOJI)

        self._pattern = pattern
        self._repl = repl

    @staticmethod
    def _build_pattern(emojis):
        """
        Build an alternation regex that matches any string in ``emojis``.
        :param emojis: iterable of literal strings to match.
        :return: the pattern string, each alternative escaped with ``re.escape``.
        """
        # Regex alternation picks the first alternative that matches, not the
        # longest. Multi-code-point sequences must therefore come before the
        # shorter emojis they start with; otherwise only the prefix is removed
        # and trailing code points (e.g. a variation selector) stay in the text.
        longest_first = sorted(emojis, key=len, reverse=True)
        return "|".join(re.escape(value) for value in longest_first)

    def process(self, text):
        """
        Replace every match of the configured pattern in ``text``.
        :param text: the text to clean.
        :return: the cleaned text; equal to ``text`` when nothing matches.
        """
        # ``re.sub`` already leaves the text untouched when there is no match,
        # so a separate ``re.search`` pre-check would only scan the text twice.
        return re.sub(
            pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
from selectolax.parser import HTMLParser | ||
|
||
|
||
class CleanHtml:
    """Mapper to clean html in text."""

    def process(self, text):
        """
        Strip HTML markup from ``text``.

        List-related tags are rewritten as plain-text markers first, then the
        result is handed to selectolax's ``HTMLParser`` and its ``text()``
        output is returned.
        :param text: the HTML (or partly HTML) text to clean.
        :return: the value of ``HTMLParser(text).text()`` after the rewrites.
        """
        # (tag, replacement) pairs, applied strictly in this order.
        tag_rewrites = (
            ("<li>", "\n*"),
            ("</li>", ""),
            ("<ol>", "\n*"),
            ("</ol>", ""),
        )
        for tag, replacement in tag_rewrites:
            text = text.replace(tag, replacement)

        return HTMLParser(text).text()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import re | ||
|
||
|
||
class CleanIdCard:
    """Mapper to clean id card in text."""

    def __init__(self, repl: str = ""):
        """
        Initialization method.
        :param repl: replacement passed to ``re.sub`` for every match
            (empty string by default, i.e. matches are removed).
        """
        self._pattern = [
            # 18 characters: 6 digits, 4-digit year, 2-digit month (00-12),
            # 2-digit day (00-31), 3 digits and a final digit or X/x.
            # The day tens digit is ``[0-2]``; a class written ``[0|1|2]``
            # would also accept a literal "|" character.
            r"\b([1-9]\d{5}[1-9]\d{3})((0\d)|(1[0-2]))(([0-2]\d)|(3[0-1]))(\d{3}[0-9Xx])(?![0-9])",
            # 15 characters: 8 digits, 2-digit month, 2-digit day, 2 digits and
            # a final digit or X/x. The day uses ``[0-2]\d`` so that days 10
            # and 20 are matched as well (``[0-2][1-9]`` would skip them).
            r"\b([1-9]\d{7})((0\d)|(1[0-2]))(([0-2]\d)|(3[0-1]))(\d{2}[0-9Xx])(?![0-9])",
        ]
        self._repl = repl

    def process(self, text):
        """
        Replace every id-card match in ``text``.
        :param text: the text to clean.
        :return: the text after applying each pattern in turn.
        """
        for regex_exp in self._pattern:
            text = re.sub(
                pattern=regex_exp, repl=self._repl, string=text, flags=re.DOTALL
            )
        return text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Copyright 2024 KubeAGI. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
|
||
import re | ||
|
||
|
||
class CleanIp:
    """Mapper to clean ipv4 and ipv6 address in text."""

    def __init__(self, pattern: str = None, repl: str = ""):
        """
        Initialization method.
        :param pattern: regular expression describing what to replace. When
            None, a built-in pattern is used that matches dotted-quad IPv4
            addresses (each octet 0-255) and IPv6 addresses written as eight
            colon-separated groups of 1-4 hex digits. Compressed IPv6 forms
            using ``::`` are not covered by the built-in pattern.
        :param repl: replacement passed to ``re.sub`` for every match
            (empty string by default, i.e. matches are removed).
        """
        if pattern is None:
            pattern = (
                # IPv4: three "octet." prefixes followed by a final octet.
                r"((?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|"
                r"(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))"
                r"{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|"
                r"(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|"
                # IPv6: seven "group:" prefixes followed by a final group.
                r"([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4})"
            )

        self._pattern = pattern
        self._repl = repl

    def process(self, text):
        """
        Replace every match of the configured pattern in ``text``.
        :param text: the text to clean.
        :return: the cleaned text; equal to ``text`` when nothing matches.
        """
        # ``re.sub`` already leaves the text untouched when there is no match,
        # so a separate ``re.search`` pre-check would only scan the text twice.
        return re.sub(
            pattern=self._pattern, repl=self._repl, string=text, flags=re.DOTALL
        )
Oops, something went wrong.