feat: semantic search on documents #812

Open · wants to merge 17 commits into base: main
11 changes: 11 additions & 0 deletions .env.example
@@ -115,3 +115,14 @@ AIRFLOW_SERVICE_SCHEME=http
AIRFLOW_SERVICE_HOST=airflow-webserver
AIRFLOW_SERVICE_PORT=8080
AIRFLOW_SERVICE_PATH_PREFIX=/api/v1

# Embeddings service config
EMBED_SERVICE_SCHEME=http
EMBED_SERVICE_HOST=badgerdoc-embeddings
EMBED_SERVICE_PORT=8080

# Search Service
ES_HOST=badgerdoc-elasticsearch
ES_PORT=9200
CHATGPT_API_KEY=
CHATGPT_MODEL=gpt-3.5-turbo
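A minimal sketch of how a client could assemble the embeddings service URL from the variables added above (the `service_url` helper and its defaults are illustrative, not part of this PR):

import os

def service_url(prefix: str) -> str:
    # Compose scheme://host:port from the *_SCHEME/*_HOST/*_PORT variables above.
    scheme = os.getenv(f"{prefix}_SCHEME", "http")
    host = os.getenv(f"{prefix}_HOST", "localhost")
    port = os.getenv(f"{prefix}_PORT", "8080")
    return f"{scheme}://{host}:{port}"

print(service_url("EMBED_SERVICE"))  # http://badgerdoc-embeddings:8080 with the values above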
10 changes: 8 additions & 2 deletions Makefile
@@ -1,4 +1,4 @@
build_all: build_base build_annotation build_users build_convert build_jobs build_keycloak build_assets build_web build_processing build_pipelines build_models build_taxonomy clean
build_all: build_base build_annotation build_users build_convert build_jobs build_keycloak build_assets build_web build_processing build_pipelines build_models build_taxonomy build_search build_embeddings clean

build_base:
mkdir -p build_dir
@@ -42,5 +42,11 @@ build_models:
build_taxonomy:
docker build --target build taxonomy/ -t badgerdoc_taxonomy

build_search:
	docker build --target build search/ -t badgerdoc_search

build_embeddings:
	docker build --target build embeddings/ -t badgerdoc_embeddings

clean:
rm -rf build_dir
rm -rf build_dir
66 changes: 64 additions & 2 deletions docker-compose-dev.yaml
@@ -12,6 +12,7 @@
# - 8082 for Keycloak.
# - 8083 for BadgerDoc web
# - 9001 for minio
# - 9200 for elasticsearch
#
###################################################################
services:
@@ -29,7 +30,8 @@ services:
--docker.exclude=badgerdoc-keycloak
--docker.exclude=badgerdoc-web
--docker.exclude=badgerdoc-zookeeper
--docker.exclude=badgerdoc-kafka
--docker.exclude=badgerdoc-kafka
--docker.exclude=badgerdoc-elasticsearch
--timeout.read-header=900s --timeout.resp-header=900s
--max=${MAX_REQ_SIZE}
volumes:
@@ -218,7 +220,8 @@ services:
networks:
- badgerdoc
depends_on:
- badgerdoc-zookeeper
badgerdoc-zookeeper:
condition: service_started

badgerdoc-pipelines:
image: badgerdoc_pipelines
@@ -268,6 +271,64 @@ services:
devices:
- "/dev/fuse"

  badgerdoc-elasticsearch:
    container_name: badgerdoc-elasticsearch
    image: amazon/opendistro-for-elasticsearch:1.13.2
    environment:
      - node.name=es01
      - cluster.name=es-docker-cluster
      - cluster.initial_master_nodes=es01
      - bootstrap.memory_lock=true
      - opendistro_security.disabled=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    healthcheck:
      test: curl --fail http://localhost:9200 || exit 1
      interval: 30s
      timeout: 3s
      retries: 10
      start_period: 30s
    networks:
      - badgerdoc
    ports:
      - ${ES_PORT}:${ES_PORT}
    volumes:
      - elasticdata:/usr/share/elasticsearch/data

  badgerdoc-embeddings:
    image: badgerdoc_embeddings
    container_name: embeddings
    env_file:
      - .env
    environment:
      - ROOT_PATH=/embeddings
    networks:
      - badgerdoc

  badgerdoc-search:
    image: badgerdoc_search
    container_name: search
    env_file:
      - .env
    environment:
      - ROOT_PATH=/search
    working_dir: /opt/search
    networks:
      - badgerdoc
    command: bash -c "./wait-for-it.sh -t 0 badgerdoc-kafka:9092 && uvicorn search.main:app --host 0.0.0.0 --port 8080 --reload"
    depends_on:
      - badgerdoc-kafka
      - badgerdoc-elasticsearch
      - badgerdoc-embeddings
    volumes:
      - ./search/:/opt/search

# TBD: Gotenberg

networks:
@@ -277,3 +338,4 @@ networks:
volumes:
  badgerdoc-data: {}
  minio-data: {}
  elasticdata: {}
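The compose healthcheck above polls Elasticsearch over plain HTTP. The same readiness probe can be expressed in Python; a minimal sketch (the function name is illustrative, and host, port, and retry values mirror the service definition above):

import time
import urllib.error
import urllib.request

def wait_for_elasticsearch(url="http://badgerdoc-elasticsearch:9200", retries=10, interval=30.0):
    # Mirrors the compose healthcheck: curl --fail http://localhost:9200
    for _ in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=3) as resp:
                if resp.status == 200:
                    return
        except (urllib.error.URLError, OSError):
            pass
        time.sleep(interval)
    raise RuntimeError(f"Elasticsearch at {url} never became healthy")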
18 changes: 18 additions & 0 deletions embeddings/Dockerfile
@@ -0,0 +1,18 @@
ARG base_image=818863528939.dkr.ecr.eu-central-1.amazonaws.com/badgerdoc/python_base:0.1.7
FROM ${base_image} as build

# Installing necessary packages
RUN pip install tensorflow tensorflow_hub fastapi numpy uvicorn python-dotenv
WORKDIR /embeddings
# Downloading the TensorFlow embedding model
RUN python -c "import tensorflow_hub as hub; hub.load('https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/qa/versions/2')"
RUN python -c "import tensorflow_hub as hub; hub.load('https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2')"

COPY embeddings embeddings
COPY .env ./
EXPOSE 8080
CMD ["uvicorn", "embeddings.main:app", "--host", "0.0.0.0", "--port", "8080"]
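The two RUN python -c "...hub.load(...)" lines warm tensorflow_hub's download cache at build time, so the image ships with the Universal Sentence Encoder weights instead of fetching hundreds of megabytes on container startup. A sketch of the same idea (TFHUB_CACHE_DIR is the standard tensorflow_hub override; the default cache path is an assumption and may vary by version):

import os
# Pin the cache so build-time and run-time loads hit the same path
# (assumption: this path survives between image layers).
os.environ.setdefault("TFHUB_CACHE_DIR", "/tmp/tfhub_modules")

import tensorflow_hub as hub

USE_QA_URL = ("https://www.kaggle.com/models/google/universal-sentence-encoder/"
              "frameworks/TensorFlow2/variations/qa/versions/2")
embed_qa = hub.load(USE_QA_URL)  # resolved from the cache created during docker build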
14 changes: 14 additions & 0 deletions embeddings/embeddings/config.py
@@ -0,0 +1,14 @@
from dotenv import find_dotenv
from pydantic import BaseSettings


class Settings(BaseSettings):
    app_title: str
    root_path: str = ""

    class Config:
        env_file: str = find_dotenv(".env")
        env_file_encoding = "utf-8"


settings = Settings()
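A quick sketch of how these settings resolve at import time (the APP_TITLE value shown is an assumption; it comes from whichever .env file find_dotenv locates, or from the process environment):

from embeddings.config import settings

print(settings.app_title)  # whatever APP_TITLE is set to, e.g. "Badgerdoc Embeddings"
print(settings.root_path)  # "/embeddings" when set by docker-compose-dev.yaml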
95 changes: 95 additions & 0 deletions embeddings/embeddings/main.py
@@ -0,0 +1,95 @@
import os

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware

import embeddings.schemas as schemas
from embeddings.config import settings

app = FastAPI(
    title=settings.app_title,
    root_path=settings.root_path,
    dependencies=[],
)

if WEB_CORS := os.getenv("WEB_CORS", ""):
    app.add_middleware(
        CORSMiddleware,
        allow_origins=WEB_CORS.split(","),
        allow_methods=["*"],
        allow_headers=["*"],
    )

embed_qa = hub.load(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/qa/versions/2")
embed = hub.load(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2")
print("module loaded")


@app.post(
    '/api/embed/sent',
    tags=["Embeddings"],
    summary="USE embeddings",
    response_model=schemas.EmbedResultSchema,
)
def text_use(request: schemas.EmbedRequest) -> schemas.EmbedResultSchema:
    texts = request.instances
    if not texts:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="invalid parameters"
        )
    return schemas.EmbedResultSchema(predictions=np.array(embed(texts)).tolist())


@app.get(
    '/api/embed/question',
    tags=["Embeddings"],
    summary="USE embeddings for Question",
    response_model=schemas.EmbedQuestionResultSchema,
)
def text_question(question: str) -> schemas.EmbedQuestionResultSchema:
    if not question:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="invalid parameters"
        )
    query_embedding = embed_qa.signatures['question_encoder'](
        tf.constant([question])
    )['outputs'][0]
    return schemas.EmbedQuestionResultSchema(
        predictions=np.array(query_embedding).tolist()
    )


@app.post(
    '/api/embed/responses',
    tags=["Embeddings"],
    summary="USE embeddings for Context Sentences",
    response_model=schemas.EmbedResponseAnswerResultSchema,
)
def text_response(
    request: schemas.EmbedResponseContextRequest,
) -> schemas.EmbedResponseAnswerResultSchema:
    responses = request.responses
    if not responses:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="invalid parameters"
        )
    response_batch = [r.sentence for r in responses]
    context_batch = [c.context for c in responses]
    encodings = embed_qa.signatures['response_encoder'](
        input=tf.constant(response_batch),
        context=tf.constant(context_batch),
    )
    ret = []
    for batch_index, sentence in enumerate(response_batch):
        ret.append(
            schemas.ResponseVector(
                sentence=sentence,
                encodings=np.array(encodings['outputs'][batch_index]).tolist(),
            )
        )
    return schemas.EmbedResponseAnswerResultSchema(embedings=ret)


if __name__ == '__main__':
    # FastAPI apps have no .run() method; serve with uvicorn instead.
    import uvicorn

    uvicorn.run(app, host='0.0.0.0', port=8080)
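A hedged client sketch for the three endpoints above, using requests; the base URL assumes the in-network service name and port from docker-compose-dev.yaml, and the sample texts are invented. USE-QA ranks a candidate response against a question by the dot product of the two 512-dim vectors:

import requests

BASE = "http://badgerdoc-embeddings:8080"

sent = requests.post(f"{BASE}/api/embed/sent",
                     json={"instances": ["BadgerDoc stores annotated documents."]})
assert len(sent.json()["predictions"][0]) == 512

question = requests.get(f"{BASE}/api/embed/question",
                        params={"question": "Where are documents stored?"})
q_vec = question.json()["predictions"]

responses = requests.post(f"{BASE}/api/embed/responses", json={
    "responses": [{"sentence": "BadgerDoc stores annotated documents.",
                   "context": "BadgerDoc stores annotated documents in minio."}],
})
r_vec = responses.json()["embedings"][0]["encodings"]  # field is spelled "embedings" in the schema

score = sum(a * b for a, b in zip(q_vec, r_vec))  # higher dot product = better answer
print(score)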
41 changes: 41 additions & 0 deletions embeddings/embeddings/schemas.py
@@ -0,0 +1,41 @@
from typing import List, Optional

from pydantic import BaseModel, Field
import pydantic


class EmbedRequest(BaseModel):
    instances: Optional[List[str]] = Field(description="list of sentences")


class EmbedResultSchema(BaseModel):
    predictions: List[pydantic.conlist(float, min_items=512, max_items=512)] = Field(
        description="an array of embedding vectors"
    )


class EmbedQuestionResultSchema(BaseModel):
    predictions: pydantic.conlist(float, min_items=512, max_items=512) = Field(
        description="embedding vector for question. dimension = 512"
    )


class ResponseContext(BaseModel):
    sentence: str = Field(description="sentence text")
    context: str = Field(description="context text")


class EmbedResponseContextRequest(BaseModel):
    responses: List[ResponseContext] = Field(
        description="an array of sentences and their text context"
    )


class ResponseVector(BaseModel):
    sentence: str = Field(description="context sentence")
    encodings: pydantic.conlist(float, min_items=512, max_items=512) = Field(
        description="embedding vector for context sentence. dimension = 512"
    )


class EmbedResponseAnswerResultSchema(BaseModel):
    embedings: List[ResponseVector] = Field(
        description="an array of embedding vectors"
    )
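The conlist(float, min_items=512, max_items=512) fields pin every vector to the encoder's output width. A small sketch of that validation behavior under pydantic v1:

import pydantic
from embeddings.schemas import EmbedQuestionResultSchema

EmbedQuestionResultSchema(predictions=[0.0] * 512)  # accepted: exactly 512 floats
try:
    EmbedQuestionResultSchema(predictions=[0.0] * 3)
except pydantic.ValidationError as err:
    print(err)  # rejected: "ensure this value has at least 512 items"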
19 changes: 11 additions & 8 deletions search/.env
@@ -1,25 +1,27 @@
APP_TITLE=Badgerdoc Search
ANNOTATION_URL=http://annotation
ANNOTATION_CATEGORIES=/categories
ANNOTATION_CATEGORIES_SEARCH=/categories/search
MANIFEST=manifest.json
TEXT_PIECES_PATH=/pieces
INDEXATION_PATH=/indexation
JOBS_URL=http://jobs
JOBS_SERVICE_HOST=badgerdoc-jobs
JOBS_SEARCH=/jobs/search
COMPUTED_FIELDS=["job_id", "category"]

KEYCLOAK_URL=http://bagerdoc-keycloack
EMBED_SENT_PATH=/api/embed/sent
EMBED_RESPONSES_PATH=/api/embed/responses
EMBED_QUESTION_PATH=/api/embed/question
KEYCLOAK_HOST="http://bagerdoc-keycloack"
JWT_ALGORITHM=RS256
CHATGPT_API_KEY=
CHATGPT_MODEL=gpt-3.5-turbo
TEXT_CATEGORY=3

ES_HOST=elasticsearch
ES_PORT=9200

KAFKA_BOOTSTRAP_SERVER=kafka:9092
KAFKA_GROUP_ID=search_group
KAFKA_BOOTSTRAP_SERVER=badgerdoc-kafka:9092
KAFKA_SEARCH_TOPIC=search
KAFKA_SEARCH_TOPIC_PARTITIONS=50
KAFKA_SEARCH_REPLICATION_FACTOR=1
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1

ES_HOST_TEST=localhost
ES_PORT_TEST=9200
@@ -28,6 +30,7 @@ S3_ENDPOINT_URL=http://minio
S3_LOGIN=minioadmin
S3_PASS=minioadmin
S3_START_PATH=annotation
S3_TEXT_PATH=files
S3_CREDENTIALS_PROVIDER=minio
S3_PREFIX=

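A minimal consumer sketch for the Kafka settings above (assuming kafka-python; the PR's actual consumer code is not part of this diff, so treat the calls below as illustrative):

import os
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    os.getenv("KAFKA_SEARCH_TOPIC", "search"),
    bootstrap_servers=os.getenv("KAFKA_BOOTSTRAP_SERVER", "badgerdoc-kafka:9092"),
)
for message in consumer:
    print(message.topic, message.value)  # each event would trigger (re)indexation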
7 changes: 3 additions & 4 deletions search/Dockerfile
@@ -1,5 +1,5 @@
ARG base_image=818863528939.dkr.ecr.eu-central-1.amazonaws.com/badgerdoc/python_base:0.1.7
FROM ${base_image} as base
FROM ${base_image} as build

ENV PYTHONPATH /opt/search
WORKDIR /opt/search
@@ -13,11 +13,10 @@ RUN : \
&& poetry config virtualenvs.create false \
&& :

FROM base as build

COPY documentation documentation
RUN poetry install --no-root --only main
CMD ["uvicorn", "search.main:app", "--host", "0.0.0.0", "--port", "8080"]
EXPOSE 8080

FROM base as test

@@ -43,4 +42,4 @@ FROM base as build-dev

RUN poetry install --no-root
COPY documentation documentation
CMD ["uvicorn", "search.main:app", "--host", "0.0.0.0", "--port", "8080", "--reload"]
#CMD ["uvicorn", "search.main:app", "--host", "0.0.0.0", "--port", "8080", "--reload"]