Skip to content

Commit

Permalink
Merge pull request #671 from wangxinbiao/main
Browse files Browse the repository at this point in the history
fix: data-processing image cannot be started.
  • Loading branch information
bjwswang committed Jan 30, 2024
2 parents 61b34f2 + 811b208 commit 01dd83b
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 30 deletions.
109 changes: 109 additions & 0 deletions docs/images/data-process.drawio
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
<mxfile host="Electron" modified="2023-11-02T10:49:05.695Z" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/21.2.8 Chrome/112.0.5615.165 Electron/24.2.0 Safari/537.36" etag="MtKcBN_l9eNnbYWFwm7D" version="21.2.8" type="device">
<diagram name="第 1 页" id="loeKpyqY9KO9q6GEpTu8">
<mxGraphModel dx="1026" dy="1843" grid="0" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="iNnB15NnGvmIqP9MPX9o-1" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;数据处理HTTP服务&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4472c4;strokeColor=none;fontColor=#ffffff;" parent="1" vertex="1">
<mxGeometry x="9" y="-170" width="810" height="40" as="geometry" />
</mxCell>
<mxCell id="iNnB15NnGvmIqP9MPX9o-23" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#dce6f2;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="120" y="-120" width="700" height="60" as="geometry" />
</mxCell>
<mxCell id="iNnB15NnGvmIqP9MPX9o-24" value="&lt;font color=&quot;#ffffff&quot; style=&quot;font-size: 18px;&quot;&gt;Controller&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4472c4;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="9" y="-120" width="100" height="60" as="geometry" />
</mxCell>
<mxCell id="iNnB15NnGvmIqP9MPX9o-25" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;Service&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4472c4;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="9" y="-55" width="100" height="60" as="geometry" />
</mxCell>
<mxCell id="iNnB15NnGvmIqP9MPX9o-26" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;Handle&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4472c4;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="9" y="10" width="100" height="115" as="geometry" />
</mxCell>
<mxCell id="iNnB15NnGvmIqP9MPX9o-27" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;Transform&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4472c4;fontColor=#FFFFFF;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="9" y="130" width="100" height="60" as="geometry" />
</mxCell>
<mxCell id="iNnB15NnGvmIqP9MPX9o-29" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;基础类&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4472c4;fontColor=#ffffff;strokeColor=none;" parent="1" vertex="1">
<mxGeometry x="9" y="197" width="100" height="60" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-3" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#dce6f2;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="120" y="-55" width="700" height="60" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-5" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;minio_process&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#a20025;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="126" y="-50" width="208" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-6" value="&lt;font color=&quot;#ffffff&quot; style=&quot;font-size: 18px;&quot;&gt;text_clean_for_minio&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#5b9bd5;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="127" y="-116" width="689" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-7" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;database_process&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#a20025;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="339.5" y="-50" width="239" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-8" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;web_api_process&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#a20025;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="584" y="-50" width="229" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-9" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#dce6f2;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="120" y="10" width="700" height="115" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-10" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;json&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="126" y="16" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-11" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;csv&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="126" y="70" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-12" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;txt&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="265" y="16" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-13" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;pdf&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="265" y="70" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-14" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;doc&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="403" y="16" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-15" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;markdown&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="403" y="70" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-16" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;html&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="541" y="16" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-17" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;ppt&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="541" y="70" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-18" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;excel&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#d80073;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="679" y="16" width="135" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-19" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#dce6f2;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="120" y="130" width="700" height="60" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-20" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;text&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#008a00;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="126" y="135" width="225" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-21" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;image&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#008a00;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="357.5" y="135" width="225" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-22" value="&lt;span style=&quot;font-size: 18px;&quot;&gt;table&lt;/span&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#008a00;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="589" y="135" width="225" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-23" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#dce6f2;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="120" y="197" width="700" height="60" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-24" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;utils&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#1ba1e2;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="126" y="202" width="225" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-25" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;common&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#1ba1e2;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="358" y="202" width="225" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-26" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;OCR&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#1ba1e2;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="588" y="202" width="225" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-27" value="" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#dce6f2;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="119.5" y="262" width="700" height="60" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-28" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;Sanic&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#6a00ff;fontColor=#ffffff;strokeColor=#3700CC;" vertex="1" parent="1">
<mxGeometry x="126" y="267" width="684" height="50" as="geometry" />
</mxCell>
<mxCell id="6DU0XRBK3AAMFuq2KatI-29" value="&lt;font style=&quot;font-size: 18px;&quot;&gt;Web框架&lt;/font&gt;" style="rounded=0;whiteSpace=wrap;html=1;fillColor=#4472c4;fontColor=#ffffff;strokeColor=none;" vertex="1" parent="1">
<mxGeometry x="9" y="262" width="100" height="60" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>
Binary file added docs/images/data-process.drawio.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion pypi/data-processing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ The data processing process includes: cleaning abnormal data, filtering, de-dupl

## Design

![Design](../assets/data_process.drawio.png)
![Design](../../docs/images/data-process.drawio.png)

## Local Development
### Software Requirements
Expand Down
2 changes: 1 addition & 1 deletion pypi/data-processing/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ aiohttp==3.8.6
ulid==1.1
minio==7.1.17
zhipuai==1.0.7
langchain==0.0.336
langchain==0.0.354
spacy==3.5.4
pypdf==3.17.1
emoji==2.2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from database_clients import postgresql_pool_client
from utils import date_time_utils


def add(
req_json,
pool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from database_clients import postgresql_pool_client
from utils import date_time_utils


def add(
req_json,
pool
Expand Down
56 changes: 28 additions & 28 deletions pypi/data-processing/src/document_loaders/async_playwright.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,17 @@
# limitations under the License.

import logging
import time
import traceback
from typing import List

from langchain_community.document_loaders.base import BaseLoader
import playwright
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document
from playwright.async_api import async_playwright

from common import log_tag_const
from document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)

Expand All @@ -32,7 +36,7 @@ def __init__(
url: str,
max_count: int = 100,
max_depth: int = 1,
interval_time: int = 1,
interval_time: int = 1000,
):
"""
Initialize the loader with a list of URL paths.
Expand All @@ -46,18 +50,17 @@ def __init__(
Raises:
ImportError: If the required 'playwright' package is not installed.
"""
self.url = url
self.max_count = max_count
self.max_depth = max_depth
self.interval_time = interval_time

try:
import playwright
except ImportError:
raise ImportError(
"playwright is required for AsyncPlaywrightLoader. "
"Please install it with `pip install playwright`."
)
if max_count is None:
max_count = 100
if max_depth is None:
max_depth = 1
if interval_time is None:
interval_time = 1000

self._url = url
self._max_count = max_count
self._max_depth = max_depth
self._interval_time = interval_time / 1000

async def ascrape_playwright(self, url: str) -> str:
"""
Expand All @@ -70,7 +73,6 @@ async def ascrape_playwright(self, url: str) -> str:
str: The scraped HTML content or an error message if an exception occurs.
"""
from playwright.async_api import async_playwright

logger.info("Starting scraping...")
results = ""
Expand Down Expand Up @@ -121,20 +123,18 @@ async def get_all_url(self):
"".join(
[
f"{log_tag_const.WEB_CRAWLING} Get all url in a web page\n",
f" url: {self.url}"
f" url: {self._url}"
]
)
)

all_url = [self.url]
sub_urls = [self.url]

all_url = [self._url]
sub_urls = [self._url]
try:
for i in range(1, self.max_depth):
for i in range(1, self._max_depth):
for sub_url in sub_urls:
children_urls = await self._get_children_url(
url=sub_url,
max_count=self.max_count,
url_count=len(all_url)
)

Expand All @@ -147,21 +147,21 @@ async def get_all_url(self):
all_url = list(unique_urls)

# 如果达到最大数量限制,直接返回
if res.get("url_count") >= self.max_count:
if res.get("url_count") >= self._max_count:
logger.info(
"".join(
[
f"{log_tag_const.WEB_CRAWLING} The number of URLs has reached the upper limit.\n",
f" max_count: {self.max_count}\n"
f" max_count: {self._max_count}\n"
]
)
)
return all_url

sub_urls = res.get("children_url")
# 时间间隔
logger.info(f"{log_tag_const.WEB_CRAWLING} Wait for {self.interval_time} seconds before continuing the visit.")
time.sleep(self.interval_time)
logger.info(f"{log_tag_const.WEB_CRAWLING} Wait for {self._interval_time} seconds before continuing the visit.")
time.sleep(self._interval_time)
return all_url
except Exception:
logger.error(
Expand All @@ -188,7 +188,7 @@ async def _get_children_url(self, url, url_count):
[
f"{log_tag_const.WEB_CRAWLING} Get sub url in a web page\n",
f" url: {url}\n",
f" max_count: {self.max_count}\n",
f" max_count: {self._max_count}\n",
f" url_count: {url_count}"
]
)
Expand All @@ -209,12 +209,12 @@ async def _get_children_url(self, url, url_count):
for link in links:
href = await link.get_attribute('href')
# 需要抓取的url数量不得超过最大数量
if url_count >= self.max_count:
if url_count >= self._max_count:
logger.info(
"".join(
[
f"{log_tag_const.WEB_CRAWLING} The number of URLs has reached the upper limit.\n",
f" max_count: {self.max_count}\n",
f" max_count: {self._max_count}\n",
f" url_count: {url_count}"
]
)
Expand Down
1 change: 1 addition & 0 deletions pypi/data-processing/src/document_loaders/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from langchain_core.documents import Document


class BaseLoader(ABC):
"""Interface for Document Loader.
Expand Down

0 comments on commit 01dd83b

Please sign in to comment.