feat: semantic search on documents #812

Open · wants to merge 17 commits into base: main
11 changes: 11 additions & 0 deletions .env.example
@@ -115,3 +115,14 @@ AIRFLOW_SERVICE_SCHEME=http
AIRFLOW_SERVICE_HOST=airflow-webserver
AIRFLOW_SERVICE_PORT=8080
AIRFLOW_SERVICE_PATH_PREFIX=/api/v1

# Embeddings service config
EMBED_SERVICE_SCHEME=http
EMBED_SERVICE_HOST=badgerdoc-embeddings
EMBED_SERVICE_PORT=8080

# Search Service
ES_HOST=badgerdoc-elasticsearch
ES_PORT=9200
CHATGPT_API_KEY=
CHATGPT_MODEL=gpt-3.5-turbo
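A minimal sketch of how a client could assemble the embeddings service URL from the variables added above (the `service_url` helper and its defaults are illustrative, not part of this PR):

import os

def service_url(prefix: str) -> str:
    # Compose scheme://host:port from the *_SCHEME/*_HOST/*_PORT variables above.
    scheme = os.getenv(f"{prefix}_SCHEME", "http")
    host = os.getenv(f"{prefix}_HOST", "localhost")
    port = os.getenv(f"{prefix}_PORT", "8080")
    return f"{scheme}://{host}:{port}"

print(service_url("EMBED_SERVICE"))  # http://badgerdoc-embeddings:8080 with the values above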
10 changes: 8 additions & 2 deletions Makefile
@@ -1,4 +1,4 @@
build_all: build_base build_annotation build_users build_convert build_jobs build_keycloak build_assets build_web build_processing build_pipelines build_models build_taxonomy clean
build_all: build_base build_annotation build_users build_convert build_jobs build_keycloak build_assets build_web build_processing build_pipelines build_models build_taxonomy build_search build_embeddings clean

build_base:
mkdir -p build_dir
@@ -42,5 +42,11 @@ build_models:
build_taxonomy:
docker build --target build taxonomy/ -t badgerdoc_taxonomy

build_search:
	docker build --target build search/ -t badgerdoc_search

build_embeddings:
	docker build --target build embeddings/ -t badgerdoc_embeddings

clean:
rm -rf build_dir
rm -rf build_dir
66 changes: 64 additions & 2 deletions docker-compose-dev.yaml
@@ -12,6 +12,7 @@
# - 8082 for Keycloak.
# - 8083 for BadgerDoc web
# - 9001 for minio
# - 9200 for elasticsearch
#
###################################################################
services:
@@ -29,7 +30,8 @@ services:
--docker.exclude=badgerdoc-keycloak
--docker.exclude=badgerdoc-web
--docker.exclude=badgerdoc-zookeeper
--docker.exclude=badgerdoc-kafka
--docker.exclude=badgerdoc-kafka
--docker.exclude=badgerdoc-elasticsearch
--timeout.read-header=900s --timeout.resp-header=900s
--max=${MAX_REQ_SIZE}
volumes:
@@ -218,7 +220,8 @@ services:
networks:
- badgerdoc
depends_on:
- badgerdoc-zookeeper
badgerdoc-zookeeper:
condition: service_started

badgerdoc-pipelines:
image: badgerdoc_pipelines
@@ -268,6 +271,64 @@ services:
devices:
- "/dev/fuse"

  badgerdoc-elasticsearch:
    container_name: badgerdoc-elasticsearch
    image: amazon/opendistro-for-elasticsearch:1.13.2
    environment:
      - node.name=es01
      - cluster.name=es-docker-cluster
      - cluster.initial_master_nodes=es01
      - bootstrap.memory_lock=true
      - opendistro_security.disabled=true
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 65536
        hard: 65536
    healthcheck:
      test: curl --fail http://localhost:9200 || exit 1
      interval: 30s
      timeout: 3s
      retries: 10
      start_period: 30s
    networks:
      - badgerdoc
    ports:
      - ${ES_PORT}:${ES_PORT}
    volumes:
      - elasticdata:/usr/share/elasticsearch/data

  badgerdoc-embeddings:
    image: badgerdoc_embeddings
    container_name: embeddings
    env_file:
      - .env
    environment:
      - ROOT_PATH=/embeddings
    networks:
      - badgerdoc

  badgerdoc-search:
    image: badgerdoc_search
    container_name: search
    env_file:
      - .env
    environment:
      - ROOT_PATH=/search
    working_dir: /opt/search
    networks:
      - badgerdoc
    command: bash -c "./wait-for-it.sh -t 0 badgerdoc-kafka:9092 && uvicorn search.main:app --host 0.0.0.0 --port 8080 --reload"
    depends_on:
      - badgerdoc-kafka
      - badgerdoc-elasticsearch
      - badgerdoc-embeddings
    volumes:
      - ./search/:/opt/search

# TBD: Gotenberg

networks:
@@ -277,3 +338,4 @@ networks:
volumes:
  badgerdoc-data: {}
  minio-data: {}
  elasticdata: {}
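The compose healthcheck above polls Elasticsearch over plain HTTP. The same readiness probe can be expressed in Python; a minimal sketch (the function name is illustrative, and host, port, and retry values mirror the service definition above):

import time
import urllib.error
import urllib.request

def wait_for_elasticsearch(url="http://badgerdoc-elasticsearch:9200", retries=10, interval=30.0):
    # Mirrors the compose healthcheck: curl --fail http://localhost:9200
    for _ in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=3) as resp:
                if resp.status == 200:
                    return
        except (urllib.error.URLError, OSError):
            pass
        time.sleep(interval)
    raise RuntimeError(f"Elasticsearch at {url} never became healthy")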
18 changes: 18 additions & 0 deletions embeddings/Dockerfile
@@ -0,0 +1,18 @@
ARG base_image=818863528939.dkr.ecr.eu-central-1.amazonaws.com/badgerdoc/python_base:0.1.7
FROM ${base_image} as build

# Installing necessary packages
RUN pip install tensorflow tensorflow_hub fastapi numpy uvicorn python-dotenv
WORKDIR /embeddings
# Downloading the TensorFlow embedding model
RUN python -c "import tensorflow_hub as hub; hub.load('https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/qa/versions/2')"
RUN python -c "import tensorflow_hub as hub; hub.load('https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2')"

COPY embeddings embeddings
COPY .env ./
EXPOSE 8080
CMD ["uvicorn", "embeddings.main:app", "--host", "0.0.0.0", "--port", "8080"]
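The two RUN python -c "...hub.load(...)" lines warm tensorflow_hub's download cache at build time, so the image ships with the Universal Sentence Encoder weights instead of fetching hundreds of megabytes on container startup. A sketch of the same idea (TFHUB_CACHE_DIR is the standard tensorflow_hub override; the default cache path is an assumption and may vary by version):

import os
# Pin the cache so build-time and run-time loads hit the same path
# (assumption: this path survives between image layers).
os.environ.setdefault("TFHUB_CACHE_DIR", "/tmp/tfhub_modules")

import tensorflow_hub as hub

USE_QA_URL = ("https://www.kaggle.com/models/google/universal-sentence-encoder/"
              "frameworks/TensorFlow2/variations/qa/versions/2")
embed_qa = hub.load(USE_QA_URL)  # resolved from the cache created during docker build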
14 changes: 14 additions & 0 deletions embeddings/embeddings/config.py
@@ -0,0 +1,14 @@
from dotenv import find_dotenv
from pydantic import BaseSettings


class Settings(BaseSettings):
    app_title: str
    root_path: str = ""

    class Config:
        env_file: str = find_dotenv(".env")
        env_file_encoding = "utf-8"


settings = Settings()
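A quick sketch of how these settings resolve at import time (the APP_TITLE value shown is an assumption; it comes from whichever .env file find_dotenv locates, or from the process environment):

from embeddings.config import settings

print(settings.app_title)  # whatever APP_TITLE is set to, e.g. "Badgerdoc Embeddings"
print(settings.root_path)  # "/embeddings" when set by docker-compose-dev.yaml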
95 changes: 95 additions & 0 deletions embeddings/embeddings/main.py
@@ -0,0 +1,95 @@
import os

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from fastapi import FastAPI, HTTPException, status
from fastapi.middleware.cors import CORSMiddleware

import embeddings.schemas as schemas
from embeddings.config import settings

app = FastAPI(
    title=settings.app_title,
    root_path=settings.root_path,
    dependencies=[],
)

if WEB_CORS := os.getenv("WEB_CORS", ""):
    app.add_middleware(
        CORSMiddleware,
        allow_origins=WEB_CORS.split(","),
        allow_methods=["*"],
        allow_headers=["*"],
    )

embed_qa = hub.load(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/qa/versions/2")
embed = hub.load(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2")
print("module loaded")


@app.post(
    '/api/embed/sent',
    tags=["Embeddings"],
    summary="USE embeddings",
    response_model=schemas.EmbedResultSchema,
)
def text_use(request: schemas.EmbedRequest) -> schemas.EmbedResultSchema:
    texts = request.instances
    if not texts:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="invalid parameters"
        )
    return schemas.EmbedResultSchema(predictions=np.array(embed(texts)).tolist())


@app.get(
    '/api/embed/question',
    tags=["Embeddings"],
    summary="USE embeddings for Question",
    response_model=schemas.EmbedQuestionResultSchema,
)
def text_question(question: str) -> schemas.EmbedQuestionResultSchema:
    if not question:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="invalid parameters"
        )
    query_embedding = embed_qa.signatures['question_encoder'](
        tf.constant([question])
    )['outputs'][0]
    return schemas.EmbedQuestionResultSchema(
        predictions=np.array(query_embedding).tolist()
    )


@app.post(
    '/api/embed/responses',
    tags=["Embeddings"],
    summary="USE embeddings for Context Sentences",
    response_model=schemas.EmbedResponseAnswerResultSchema,
)
def text_response(
    request: schemas.EmbedResponseContextRequest,
) -> schemas.EmbedResponseAnswerResultSchema:
    responses = request.responses
    if not responses:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="invalid parameters"
        )
    response_batch = [r.sentence for r in responses]
    context_batch = [c.context for c in responses]
    encodings = embed_qa.signatures['response_encoder'](
        input=tf.constant(response_batch),
        context=tf.constant(context_batch),
    )
    ret = []
    for batch_index, sentence in enumerate(response_batch):
        ret.append(
            schemas.ResponseVector(
                sentence=sentence,
                encodings=np.array(encodings['outputs'][batch_index]).tolist(),
            )
        )
    return schemas.EmbedResponseAnswerResultSchema(embedings=ret)


if __name__ == '__main__':
    # FastAPI apps have no .run() method; serve with uvicorn instead.
    import uvicorn

    uvicorn.run(app, host='0.0.0.0', port=8080)
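A hedged client sketch for the three endpoints above, using requests; the base URL assumes the in-network service name and port from docker-compose-dev.yaml, and the sample texts are invented. USE-QA ranks a candidate response against a question by the dot product of the two 512-dim vectors:

import requests

BASE = "http://badgerdoc-embeddings:8080"

sent = requests.post(f"{BASE}/api/embed/sent",
                     json={"instances": ["BadgerDoc stores annotated documents."]})
assert len(sent.json()["predictions"][0]) == 512

question = requests.get(f"{BASE}/api/embed/question",
                        params={"question": "Where are documents stored?"})
q_vec = question.json()["predictions"]

responses = requests.post(f"{BASE}/api/embed/responses", json={
    "responses": [{"sentence": "BadgerDoc stores annotated documents.",
                   "context": "BadgerDoc stores annotated documents in minio."}],
})
r_vec = responses.json()["embedings"][0]["encodings"]  # field is spelled "embedings" in the schema

score = sum(a * b for a, b in zip(q_vec, r_vec))  # higher dot product = better answer
print(score)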
41 changes: 41 additions & 0 deletions embeddings/embeddings/schemas.py
@@ -0,0 +1,41 @@
from typing import List, Optional

from pydantic import BaseModel, Field
import pydantic


class EmbedRequest(BaseModel):
    instances: Optional[List[str]] = Field(description="list of sentences")


class EmbedResultSchema(BaseModel):
    predictions: List[pydantic.conlist(float, min_items=512, max_items=512)] = Field(
        description="an array of embedding vectors"
    )


class EmbedQuestionResultSchema(BaseModel):
    predictions: pydantic.conlist(float, min_items=512, max_items=512) = Field(
        description="embedding vector for question. dimension = 512"
    )


class ResponseContext(BaseModel):
    sentence: str = Field(description="sentence text")
    context: str = Field(description="context text")


class EmbedResponseContextRequest(BaseModel):
    responses: List[ResponseContext] = Field(
        description="an array of sentences and their text context"
    )


class ResponseVector(BaseModel):
    sentence: str = Field(description="context sentence")
    encodings: pydantic.conlist(float, min_items=512, max_items=512) = Field(
        description="embedding vector for context sentence. dimension = 512"
    )


class EmbedResponseAnswerResultSchema(BaseModel):
    embedings: List[ResponseVector] = Field(
        description="an array of embedding vectors"
    )
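The conlist(float, min_items=512, max_items=512) fields pin every vector to the encoder's output width. A small sketch of that validation behavior under pydantic v1:

import pydantic
from embeddings.schemas import EmbedQuestionResultSchema

EmbedQuestionResultSchema(predictions=[0.0] * 512)  # accepted: exactly 512 floats
try:
    EmbedQuestionResultSchema(predictions=[0.0] * 3)
except pydantic.ValidationError as err:
    print(err)  # rejected: "ensure this value has at least 512 items"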
19 changes: 11 additions & 8 deletions search/.env
@@ -1,25 +1,27 @@
APP_TITLE=Badgerdoc Search
ANNOTATION_URL=http://annotation
ANNOTATION_CATEGORIES=/categories
ANNOTATION_CATEGORIES_SEARCH=/categories/search
MANIFEST=manifest.json
TEXT_PIECES_PATH=/pieces
INDEXATION_PATH=/indexation
JOBS_URL=http://jobs
JOBS_SERVICE_HOST=badgerdoc-jobs
JOBS_SEARCH=/jobs/search
COMPUTED_FIELDS=["job_id", "category"]

KEYCLOAK_URL=http://bagerdoc-keycloack
EMBED_SENT_PATH=/api/embed/sent
EMBED_RESPONSES_PATH=/api/embed/responses
EMBED_QUESTION_PATH=/api/embed/question
KEYCLOAK_HOST="http://bagerdoc-keycloack"
JWT_ALGORITHM=RS256
CHATGPT_API_KEY=
CHATGPT_MODEL=gpt-3.5-turbo
TEXT_CATEGORY=3

ES_HOST=elasticsearch
ES_PORT=9200

KAFKA_BOOTSTRAP_SERVER=kafka:9092
KAFKA_GROUP_ID=search_group
KAFKA_BOOTSTRAP_SERVER=badgerdoc-kafka:9092
KAFKA_SEARCH_TOPIC=search
KAFKA_SEARCH_TOPIC_PARTITIONS=50
KAFKA_SEARCH_REPLICATION_FACTOR=1
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1

ES_HOST_TEST=localhost
ES_PORT_TEST=9200
@@ -28,6 +30,7 @@ S3_ENDPOINT_URL=http://minio
S3_LOGIN=minioadmin
S3_PASS=minioadmin
S3_START_PATH=annotation
S3_TEXT_PATH=files
S3_CREDENTIALS_PROVIDER=minio
S3_PREFIX=

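A minimal consumer sketch for the Kafka settings above (assuming kafka-python; the PR's actual consumer code is not part of this diff, so treat the calls below as illustrative):

import os
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    os.getenv("KAFKA_SEARCH_TOPIC", "search"),
    bootstrap_servers=os.getenv("KAFKA_BOOTSTRAP_SERVER", "badgerdoc-kafka:9092"),
)
for message in consumer:
    print(message.topic, message.value)  # each event would trigger (re)indexation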
7 changes: 3 additions & 4 deletions search/Dockerfile
@@ -1,5 +1,5 @@
ARG base_image=818863528939.dkr.ecr.eu-central-1.amazonaws.com/badgerdoc/python_base:0.1.7
FROM ${base_image} as base
FROM ${base_image} as build

ENV PYTHONPATH /opt/search
WORKDIR /opt/search
@@ -13,11 +13,10 @@ RUN : \
&& poetry config virtualenvs.create false \
&& :

FROM base as build

COPY documentation documentation
RUN poetry install --no-root --only main
CMD ["uvicorn", "search.main:app", "--host", "0.0.0.0", "--port", "8080"]
EXPOSE 8080

FROM base as test

@@ -43,4 +42,4 @@ FROM base as build-dev

RUN poetry install --no-root
COPY documentation documentation
CMD ["uvicorn", "search.main:app", "--host", "0.0.0.0", "--port", "8080", "--reload"]
#CMD ["uvicorn", "search.main:app", "--host", "0.0.0.0", "--port", "8080", "--reload"]