Skip to content

Commit

Permalink
feat:Real-time data access
Browse files Browse the repository at this point in the history
  • Loading branch information
hsp committed Jan 23, 2024
1 parent 93be6a6 commit 70960f7
Show file tree
Hide file tree
Showing 8 changed files with 1,214 additions and 0 deletions.
43 changes: 43 additions & 0 deletions deploy/charts/arcadia/templates/pg-init-data-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,50 @@ data:
COMMENT ON COLUMN public.data_process_task_stage_log.update_datetime IS '更新时间';
COMMENT ON COLUMN public.data_process_task_stage_log.update_user IS '更新人';
COMMENT ON COLUMN public.data_process_task_stage_log.update_program IS '更新程序';
-- Table data_process_task_document_web_url.
-- Created only when it does not already exist; "id" is the primary key.
-- Besides the task_id / document_id references it stores the web_url, the
-- title/description/content/content_clean text, a language tag, a short
-- status code with an optional error_message, and create_*/update_* audit
-- columns (all kept as varchar).
-- NOTE(review): presumably one row per crawled page of a data-processing
-- task -- confirm against the crawler code.
create table if not exists data_process_task_document_web_url
(
id varchar(32) not null
constraint data_process_task_document_web_url_pkey
primary key,
task_id varchar(32),
document_id varchar(32),
level varchar(32),
web_url varchar(4096),
title varchar(1024),
description text,
content text,
content_clean text,
language varchar(32),
status varchar(4),
error_message text,
create_datetime varchar(32),
create_user varchar(32),
create_program varchar(64),
update_datetime varchar(32),
update_user varchar(32),
update_program varchar(32)
);
-- Table data_process_task_document_image.
-- Created only when it does not already exist; "id" is the primary key.
-- Besides the task_id / document_id references it stores the image url,
-- image_path, ocr_content, image_info and meta_info text, plus
-- create_*/update_* audit columns (all kept as varchar).
-- NOTE(review): document_id is varchar(512) here but varchar(32) in
-- data_process_task_document_web_url -- confirm this is intentional.
create table if not exists data_process_task_document_image
(
id varchar(32) not null
constraint data_process_task_document_image_pkey
primary key,
task_id varchar(32),
document_id varchar(512),
url varchar(1024),
image_path varchar(4096),
ocr_content text,
image_info text,
create_datetime varchar(32),
create_user varchar(32),
create_program varchar(64),
update_datetime varchar(32),
update_user varchar(32),
update_program varchar(32),
meta_info text
);
kind: ConfigMap
metadata:
Expand Down
1 change: 1 addition & 0 deletions pypi/data-processing/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ WORKDIR /arcadia_app
RUN chmod 777 /arcadia_app/entrypoint.sh

RUN pip install -r requirements.txt
RUN playwright install && playwright install-deps

ENTRYPOINT ["./entrypoint.sh"]

Expand Down
45 changes: 45 additions & 0 deletions pypi/data-processing/db-scripts/init-database-schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,48 @@
COMMENT ON COLUMN public.data_process_task_stage_log.update_datetime IS '更新时间';
COMMENT ON COLUMN public.data_process_task_stage_log.update_user IS '更新人';
COMMENT ON COLUMN public.data_process_task_stage_log.update_program IS '更新程序';

-- Table data_process_task_document_web_url.
-- Created only when it does not already exist; "id" is the primary key.
-- Besides the task_id / document_id references it stores the web_url, the
-- title/description/content/content_clean text, a language tag, a short
-- status code with an optional error_message, and create_*/update_* audit
-- columns (all kept as varchar).
-- NOTE(review): presumably one row per crawled page of a data-processing
-- task -- confirm against the crawler code.
create table if not exists data_process_task_document_web_url
(
id varchar(32) not null
constraint data_process_task_document_web_url_pkey
primary key,
task_id varchar(32),
document_id varchar(32),
level varchar(32),
web_url varchar(4096),
title varchar(1024),
description text,
content text,
content_clean text,
language varchar(32),
status varchar(4),
error_message text,
create_datetime varchar(32),
create_user varchar(32),
create_program varchar(64),
update_datetime varchar(32),
update_user varchar(32),
update_program varchar(32)
);

-- Table data_process_task_document_image.
-- Created only when it does not already exist; "id" is the primary key.
-- Besides the task_id / document_id references it stores the image url,
-- image_path, ocr_content, image_info and meta_info text, plus
-- create_*/update_* audit columns (all kept as varchar).
-- NOTE(review): document_id is varchar(512) here but varchar(32) in
-- data_process_task_document_web_url -- confirm this is intentional.
create table if not exists data_process_task_document_image
(
id varchar(32) not null
constraint data_process_task_document_image_pkey
primary key,
task_id varchar(32),
document_id varchar(512),
url varchar(1024),
image_path varchar(4096),
ocr_content text,
image_info text,
create_datetime varchar(32),
create_user varchar(32),
create_program varchar(64),
update_datetime varchar(32),
update_user varchar(32),
update_program varchar(32),
meta_info text
);

4 changes: 4 additions & 0 deletions pypi/data-processing/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,7 @@ opencc-python-reimplemented==0.1.7
selectolax==0.3.17
openai==1.3.7
python-docx==1.1.0

bs4==0.0.1
playwright==1.40.0
pillow==10.2.0
2 changes: 2 additions & 0 deletions pypi/data-processing/src/common/log_tag_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,5 @@
OPEN_AI = "Open AI"

CONFIG = "Config"

WEB_CRAWLING = "Web Url Utils"
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# Copyright 2023 KubeAGI.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from database_clients import postgresql_pool_client
from utils import date_time_utils

def add(
    req_json,
    pool
):
    """Insert one row into ``public.data_process_task_document_image``.

    Args:
        req_json: Mapping that supplies ``creator`` plus the column values
            ``id``, ``document_id``, ``task_id``, ``url``, ``image_path``,
            ``ocr_content``, ``image_info`` and ``meta_info``. Every key is
            read with ``[]``, so all of them must be present.
        pool: Connection pool forwarded to ``postgresql_pool_client``.

    Returns:
        Whatever ``postgresql_pool_client.execute_update`` returns.
    """
    timestamp = date_time_utils.now_str()
    operator = req_json['creator']
    program_label = '数据处理URL-新增'

    # Column values copied verbatim from the request, in insert order.
    copied_keys = (
        'id',
        'document_id',
        'task_id',
        'url',
        'image_path',
        'ocr_content',
        'image_info',
        'meta_info',
    )
    bind_values = {key: req_json[key] for key in copied_keys}

    # A fresh row carries identical create_* and update_* audit values.
    for prefix in ('create', 'update'):
        bind_values[prefix + '_datetime'] = timestamp
        bind_values[prefix + '_user'] = operator
        bind_values[prefix + '_program'] = program_label

    statement = """
insert into public.data_process_task_document_image (
id,
document_id,
task_id,
url,
image_path,
ocr_content,
image_info,
meta_info,
create_datetime,
create_user,
create_program,
update_datetime,
update_user,
update_program
)
values (
%(id)s,
%(document_id)s,
%(task_id)s,
%(url)s,
%(image_path)s,
%(ocr_content)s,
%(image_info)s,
%(meta_info)s,
%(create_datetime)s,
%(create_user)s,
%(create_program)s,
%(update_datetime)s,
%(update_user)s,
%(update_program)s
)
""".strip()

    return postgresql_pool_client.execute_update(pool, statement, bind_values)


def update_by_id(
    req_json,
    pool
):
    """Update the row of ``public.data_process_task_document_image`` with the given id.

    Overwrites ``url``, ``image_path``, ``ocr_content``, ``image_info`` and
    ``meta_info`` and refreshes the ``update_*`` audit columns.

    Args:
        req_json: Mapping that supplies ``creator``, ``id``, ``document_id``,
            ``task_id``, ``url``, ``image_path``, ``ocr_content``,
            ``image_info`` and ``meta_info``. Every key is read with ``[]``,
            so all of them must be present.
        pool: Connection pool forwarded to ``postgresql_pool_client``.

    Returns:
        Whatever ``postgresql_pool_client.execute_update`` returns.
    """
    now = date_time_utils.now_str()
    user = req_json['creator']
    program = '数据处理URL-更新'

    # document_id and task_id are bound but not referenced by the statement
    # below; they are kept so the set of required request keys is unchanged.
    params = {
        'id': req_json['id'],
        'document_id': req_json['document_id'],
        'task_id': req_json['task_id'],
        'url': req_json['url'],
        'image_path': req_json['image_path'],
        'ocr_content': req_json['ocr_content'],
        'image_info': req_json['image_info'],
        'meta_info': req_json['meta_info'],
        'update_datetime': now,
        'update_user': user,
        'update_program': program
    }

    # Fix: the assignment list previously lacked the comma after meta_info,
    # which made the whole UPDATE statement a SQL syntax error.
    sql = """
update public.data_process_task_document_image set
url = %(url)s,
image_path = %(image_path)s,
ocr_content = %(ocr_content)s,
image_info = %(image_info)s,
meta_info = %(meta_info)s,
update_datetime = %(update_datetime)s,
update_user = %(update_user)s,
update_program = %(update_program)s
where id = %(id)s
""".strip()

    res = postgresql_pool_client.execute_update(pool, sql, params)
    return res


def delete_by_id(
    req_json,
    pool
):
    """Delete the row of ``public.data_process_task_document_image`` with the given id.

    Args:
        req_json: Mapping that supplies the ``id`` of the row to remove.
        pool: Connection pool forwarded to ``postgresql_pool_client``.

    Returns:
        Whatever ``postgresql_pool_client.execute_update`` returns.
    """
    statement = """
delete from public.data_process_task_document_image
where
id = %(id)s
""".strip()

    bind_values = {'id': req_json['id']}
    return postgresql_pool_client.execute_update(pool, statement, bind_values)


def info_by_id(
    req_json,
    pool
):
    """Fetch the row of ``public.data_process_task_document_image`` with the given id.

    Args:
        req_json: Mapping that supplies the ``id`` to look up.
        pool: Connection pool forwarded to ``postgresql_pool_client``.

    Returns:
        Whatever ``postgresql_pool_client.execute_query`` returns for the
        select over all columns of the table.
    """
    statement = """
select
id,
document_id,
task_id,
url,
image_path,
ocr_content,
image_info,
meta_info,
create_datetime,
create_user,
create_program,
update_datetime,
update_user,
update_program
from
public.data_process_task_document_image
where
id = %(id)s
""".strip()

    bind_values = {'id': req_json['id']}
    return postgresql_pool_client.execute_query(pool, statement, bind_values)


def list_by_count(
    req_json,
    pool
):
    """Count the rows of ``public.data_process_task_document_image`` matching a url keyword.

    The filter is the same ``url like %keyword%`` condition used by
    ``list_by_page``, so the count agrees with the paged listing.

    Args:
        req_json: Mapping that supplies ``url``, the substring to search for.
        pool: Connection pool forwarded to ``postgresql_pool_client``.

    Returns:
        Whatever ``postgresql_pool_client.execute_count_query`` returns.
    """
    params = {
        'keyword': '%' + req_json['url'] + '%'
    }

    # Fix: the image table has a "url" column, not "web_url" (that column
    # belongs to data_process_task_document_web_url).
    sql = """
select
count(*)
from
public.data_process_task_document_image
where
url like %(keyword)s
""".strip()

    res = postgresql_pool_client.execute_count_query(pool, sql, params)
    return res


def list_by_page(
    req_json,
    pool
):
    """List rows of ``public.data_process_task_document_image`` matching a url keyword.

    Rows are ordered by ``create_datetime`` descending and limited to one page.

    Args:
        req_json: Mapping that supplies ``url`` (substring to search for),
            ``pageIndex`` and ``pageSize`` (both converted with ``int``).
        pool: Connection pool forwarded to ``postgresql_pool_client``.

    Returns:
        Whatever ``postgresql_pool_client.execute_query`` returns.
    """
    like_pattern = '%' + req_json['url'] + '%'
    # NOTE(review): pageIndex is bound directly as the SQL OFFSET (a row
    # count), not multiplied by pageSize -- confirm callers pass an offset.
    row_offset = int(req_json['pageIndex'])
    row_limit = int(req_json['pageSize'])

    bind_values = {
        'keyword': like_pattern,
        'pageIndex': row_offset,
        'pageSize': row_limit,
    }

    statement = """
select
id,
document_id,
task_id,
url,
image_path,
ocr_content,
image_info,
meta_info,
create_datetime,
create_user,
create_program,
update_datetime,
update_user,
update_program
from
public.data_process_task_document_image
where
url like %(keyword)s
order by create_datetime desc
limit %(pageSize)s offset %(pageIndex)s
""".strip()

    return postgresql_pool_client.execute_query(pool, statement, bind_values)
Loading

0 comments on commit 70960f7

Please sign in to comment.