diff --git a/pypi/data-processing/src/common/config.py b/pypi/data-processing/src/common/config.py index 2f8675577..2b5fcc99f 100644 --- a/pypi/data-processing/src/common/config.py +++ b/pypi/data-processing/src/common/config.py @@ -68,11 +68,16 @@ def __set_property_value(self): self.llm_qa_retry_count = int(llm_qa_retry_count) - # knowledge + # dataprocess + dataprocess = model_cr.get_dataprocess_in_k8s_configmap( + namespace=k8s_pod_namespace, config_map_name=k8s_default_config + ) # chunk size - self.knowledge_chunk_size = 500 + self.knowledge_chunk_size = dataprocess.get("chunkSize", 500) # chunk overlap self.knowledge_chunk_overlap = 50 + # worker + self.worker = dataprocess.get("worker", 1) # backend PostgreSQL postgresql_config = postgresql_cr.get_postgresql_config_in_k8s_configmap( diff --git a/pypi/data-processing/src/document_chunks/spacy_text_splitter.py b/pypi/data-processing/src/document_chunks/spacy_text_splitter.py index df05fe41b..a0e74ca45 100644 --- a/pypi/data-processing/src/document_chunks/spacy_text_splitter.py +++ b/pypi/data-processing/src/document_chunks/spacy_text_splitter.py @@ -24,10 +24,15 @@ def __init__( self, separator: str = "\n\n", pipeline: str = "zh_core_web_sm", - chunk_size: int = 500, - chunk_overlap: int = 10, + chunk_size: int = None, + chunk_overlap: int = None, ): """Initialize the spacy text splitter.""" + if chunk_size is None: + chunk_size = config.knowledge_chunk_size + if chunk_overlap is None: + chunk_overlap = config.knowledge_chunk_overlap + if chunk_overlap > chunk_size: raise ValueError( f"Got a larger chunk overlap ({chunk_overlap}) than chunk size " diff --git a/pypi/data-processing/src/kube/model_cr.py b/pypi/data-processing/src/kube/model_cr.py index 4170ebb8d..7c7a57fad 100644 --- a/pypi/data-processing/src/kube/model_cr.py +++ b/pypi/data-processing/src/kube/model_cr.py @@ -124,4 +124,34 @@ def get_spec_for_embedding_k8s_cr(name, namespace): return {"status": 200, "message": "获取embedding中的provider成功", "data": provider} except Exception as ex: logger.error(str(ex)) - return {"status": 400, "message": "获取embedding中的provider失败", "data": ""} \ No newline at end of file + return {"status": 400, "message": "获取embedding中的provider失败", "data": ""} + +def get_dataprocess_in_k8s_configmap(namespace, config_map_name): + """Get the dataprocess in the configmap. + + namespace: namespace; + config_map_name: config map name + """ + try: + kube = client.KubeEnv() + + config_map = kube.read_namespaced_config_map( + namespace=namespace, name=config_map_name + ) + + config = config_map.data.get("dataprocess") + + json_data = yaml.safe_load(config) + + return json_data + except Exception as ex: + logger.error( + "".join( + [ + f"Can not the dataprocess. The error is: \n", + f"{traceback.format_exc()}\n", + ] + ) + ) + + return None diff --git a/pypi/data-processing/src/server.py b/pypi/data-processing/src/server.py index f084b758d..b59c63013 100644 --- a/pypi/data-processing/src/server.py +++ b/pypi/data-processing/src/server.py @@ -88,4 +88,4 @@ def _create_database_connection(): if __name__ == "__main__": - sanic_app.run(host="0.0.0.0", port=28888, access_log=False, debug=False, workers=2) + sanic_app.run(host="0.0.0.0", port=28888, access_log=False, debug=False, workers=config.worker)