Skip to content

Commit

Permalink
feat:add library to data process
Browse files Browse the repository at this point in the history
  • Loading branch information
wangxinbiao committed Mar 8, 2024
1 parent d88f6ef commit 789cb01
Show file tree
Hide file tree
Showing 12 changed files with 893 additions and 0 deletions.
44 changes: 44 additions & 0 deletions examples/document_transformers/try_pdf2csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from kubeagi_core.document_transformers.pdf2csv import PDF2CSVTransform


def test_pdf_to_csv():
    """Run ``PDF2CSVTransform`` on ``data/test.pdf`` and print a completion marker."""
    # Settings forwarded verbatim as ``llm_config``.
    llm_settings = {
        "model": "6ac7baa2-71e7-4ffc-bd49-9356e743ecbb",
        "base_url": "http://fastchat-api.172.22.96.167.nip.io/v1",
        "api_key": "fake",
        "type": "openai",
        "temperature": "0.7",
        "max_tokens": "2048",
    }
    # Cleaning steps, listed in the order they are handed to the transformer.
    cleaning_steps = [
        {"type": "chinese_convert"},
        {"type": "remove_emojis"},
        {"type": "remove_email", "repl": "<EMAIL>"},
        {"type": "remove_ip_address", "repl": "<IP>"},
        {"type": "remove_phone", "repl": "<PHONE>"},
    ]

    # NOTE(review): the CLI in kubeagi_cli/convert.py passes this list as
    # ``data_cleaning_config``; here the keyword is ``data_clean_config``.
    # Confirm which name PDF2CSVTransform actually accepts.
    transformer = PDF2CSVTransform(
        file_path="data/test.pdf",
        llm_config=llm_settings,
        data_clean_config=cleaning_steps,
        output_dir="data",
    )
    transformer.transform()
    print("<<< Finished")


# Allow running this example directly as a script.
if __name__ == "__main__":
    test_pdf_to_csv()
138 changes: 138 additions & 0 deletions examples/ops/try_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from kubeagi_core.ops.transform import DataConvert
from kubeagi_core.ops.transform import Clean
from kubeagi_core.ops.transform import FixUnicode


def test_traditional_to_simplified():
    """Pass a sample sentence through ``DataConvert.chinese_convert`` and print the result."""
    print(">>> Starting traditional to simplified")
    sample = "風暴帶來的暫停使消防員和其他緊急反應人員得以進入禁區進行結構破壞評估。"

    converted = DataConvert().chinese_convert(sample)
    print("<<< Finished")
    print(f"clean text: {converted}")


def test_bank_card_convert():
    """Call ``DataConvert.bank_card_convert`` with ``repl="xxxx"`` on three sample numbers and print the result."""
    print(">>> Starting convert bank card")
    sample = "银行卡号1:1234567890123456,银行卡号2:12345678901234567,银行卡号3:1234567890123456789"

    masked = DataConvert().bank_card_convert(text=sample, repl="xxxx")
    print("<<< Finished")
    print(f"clean text: {masked}")


def test_email_convert():
    """Call ``DataConvert.email_convert`` with ``repl="xxxx"`` on a sentence containing an address and print the result."""
    print(">>> Starting convert email")
    sample = "如果需要可以联系官方邮箱:172817631@qq.com马上申请为你开通"

    masked = DataConvert().email_convert(text=sample, repl="xxxx")
    print("<<< Finished")
    print(f"clean text: {masked}")


def test_emoji_convert():
    """Call ``DataConvert.emojis_convert`` with ``repl="xxxx"`` on text containing an emoji and print the result."""
    print(">>> Starting convert emoji")
    sample = "这是一段带有表情符号😊的文本。"

    replaced = DataConvert().emojis_convert(text=sample, repl="xxxx")
    print("<<< Finished")
    print(f"clean text: {replaced}")


def test_clean_html():
    """Run ``Clean.clean_html`` on an HTML fragment and print the result.

    The start banner previously read "convert emoji" (copied from the emoji
    example); it now names the operation this function actually exercises.
    """
    print(">>> Starting clean html")
    text = "<div class='center'><span class='bolded'>学员成绩单分析报告"
    clean = Clean()

    clean_text = clean.clean_html(text=text)
    print("<<< Finished")
    print(f"clean text: {clean_text}")


def test_id_card_convert():
    """Call ``DataConvert.id_card_convert`` with ``repl="xxxx"`` on two sample ID numbers and print the result."""
    print(">>> Starting convert id card")
    sample = "身份证号1:123451230112121234, 身份证号2:12345123011212123x"

    masked = DataConvert().id_card_convert(text=sample, repl="xxxx")
    print("<<< Finished")
    print(f"clean text: {masked}")


def test_ip_convert():
    """Call ``DataConvert.ip_convert`` with ``repl="xxxx"`` on text containing an IP address and print the result."""
    print(">>> Starting convert ip")
    sample = "服务器登陆ip为192.168.255.255"

    masked = DataConvert().ip_convert(text=sample, repl="xxxx")
    print("<<< Finished")
    print(f"clean text: {masked}")


def test_phone_convert():
    """Call ``DataConvert.phone_convert`` with ``repl="xxxx"`` on text containing a phone number and print the result."""
    print(">>> Starting convert phone")
    sample = "手机号为18672615192"

    masked = DataConvert().phone_convert(text=sample, repl="xxxx")
    print("<<< Finished")
    print(f"clean text: {masked}")


def test_weixin_convert():
    """Call ``DataConvert.weixin_convert`` with ``repl="xxxx"`` on text containing a WeChat handle and print the result."""
    print(">>> Starting convert weixin")
    sample = "你的wx:qw123"

    masked = DataConvert().weixin_convert(text=sample, repl="xxxx")
    print("<<< Finished")
    print(f"clean text: {masked}")


def test_fix_unicode():
    """Run ``FixUnicode.process`` on a sample string and print the result."""
    print(">>> Starting fix unicode")
    sample = "法律工作者。 — like this one."

    fixed = FixUnicode().process(text=sample)
    print("<<< Finished")
    print(f"clean text: {fixed}")


def test_invisible_characterse_convert():
    """Call ``DataConvert.invisible_characters_convert`` with an empty replacement and print the result."""
    print(">>> Starting convert invisible characterse")
    sample = "一户一表、水表出户、抄表到户"

    stripped = DataConvert().invisible_characters_convert(text=sample, repl="")
    print("<<< Finished")
    print(f"clean text: {stripped}")


def test_space_convert():
    """Call ``DataConvert.space_convert`` with a single-space replacement and print the result."""
    print(">>> Starting convert space")
    sample = "41 行业大模型标准体系及能力架构研究报告行业大模型“千行百业”落地"

    normalized = DataConvert().space_convert(text=sample, repl=" ")
    print("<<< Finished")
    print(f"clean text: {normalized}")
6 changes: 6 additions & 0 deletions libs/cli/kubeagi_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,17 @@
import typer
from typing_extensions import Annotated
from kubeagi_core.evaluation.ragas_eval import RagasEval
from . import convert


__version__ = "0.0.1"

# Root Typer application; command groups are attached to it below.
app = typer.Typer(no_args_is_help=True, add_completion=False)
app.add_typer(
    convert.convert_cli,
    name="convert",
    # ``convert.__doc__`` is None when the module has no docstring, which
    # would leave the ``convert`` group without help text; fall back to a
    # fixed description in that case.
    help=convert.__doc__ or "Convert documents to other formats.",
)


@app.command()
Expand Down
74 changes: 74 additions & 0 deletions libs/cli/kubeagi_cli/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright 2024 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import typer
import ujson

from typing import List
from typing_extensions import Annotated

from kubeagi_core.document_transformers.pdf2csv import PDF2CSVTransform


# Typer sub-application holding the conversion commands defined in this module.
convert_cli = typer.Typer(
    add_completion=False,
    no_args_is_help=True,
)


@convert_cli.command()
def pdf(
    file_path: Annotated[
        str,
        typer.Argument(
            help="file path",
        ),
    ] = None,
    llm_config: Annotated[
        str,
        typer.Argument(
            help="llm config for generate qa",
        ),
    ] = None,
    # The list default is only read, never mutated: the name is rebound to a
    # freshly built list inside the function body.
    data_cleaning_config: Annotated[
        List[str],
        typer.Option(help="data cleaning config"),
    ] = [],
    output_dir: Annotated[
        str,
        typer.Option(help="file output path"),
    ] = None,
    chunk_size: Annotated[
        int,
        typer.Option(help="text chunk size"),
    ] = 500,
    chunk_overlap: Annotated[
        int,
        typer.Option(help="text chunk overlap"),
    ] = 50,
):
    """
    Transform a PDF file to CSV with PDF2CSVTransform.

    llm_config and every data_cleaning_config entry are parsed as JSON.
    """
    # ``llm_config`` defaults to None, and decoding None would fail with an
    # opaque error; report the missing argument explicitly instead.
    if llm_config is None:
        raise typer.BadParameter("llm_config is required and must be a JSON string")

    # Decode each cleaning step; an empty or missing list stays an empty list.
    try:
        cleaning_steps = [ujson.loads(raw) for raw in data_cleaning_config or []]
    except ValueError as err:
        raise typer.BadParameter(
            f"data_cleaning_config is not valid JSON: {err}"
        ) from err

    try:
        parsed_llm_config = ujson.loads(llm_config)
    except ValueError as err:
        raise typer.BadParameter(f"llm_config is not valid JSON: {err}") from err

    pdf_transformer = PDF2CSVTransform(
        file_path=file_path,
        llm_config=parsed_llm_config,
        data_cleaning_config=cleaning_steps,
        output_dir=output_dir,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    return pdf_transformer.transform()
1 change: 1 addition & 0 deletions libs/core/kubeagi_core/document_transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Loading

0 comments on commit 789cb01

Please sign in to comment.