This repository has been archived by the owner on Jun 25, 2023. It is now read-only.

Electriclizard trt solution #32

Open
wants to merge 35 commits into base: main
Changes from all commits (35 commits)
b10375a
infrastructure layer
May 13, 2023
558714a
service layer
May 13, 2023
5923b45
handlers, configuration and transport app layer
May 13, 2023
a9f6005
Containerization
May 13, 2023
a9659ca
helm chart updates
May 13, 2023
3bae4aa
entrypoint fix
May 13, 2023
06e3469
Merge pull request #1 from electriclizard/dev
electriclizard May 15, 2023
8b893f4
Merge pull request #13 from electriclizard/dev
electriclizard May 15, 2023
2cd6166
make models work with batches
May 16, 2023
ba96614
async queues for model calls
May 16, 2023
e5167f6
Merge pull request #18 from electriclizard/async
electriclizard May 16, 2023
ae6a534
Merge pull request #2 from electriclizard/async
electriclizard May 20, 2023
e13fdea
Merge pull request #3 from electriclizard/dev
electriclizard May 20, 2023
523b8ff
split model and tokenizer
May 21, 2023
9f2b047
create one more queue for batch tokenise task
May 21, 2023
614d775
split tokenization
May 21, 2023
9f40de7
Merge pull request #4 from electriclizard/splitted-tokenization
electriclizard May 21, 2023
b644a06
Merge pull request #21 from electriclizard/async
electriclizard May 21, 2023
0c9d62d
fix imports
May 30, 2023
d0266a6
remove build_models func
May 30, 2023
71908d7
fix CUDA out of memory error, add batch size calculation and batch…
May 30, 2023
b3d0281
fix double model init and raise batch size
May 30, 2023
d834a64
Merge pull request #26 from electriclizard/splitted-tokenization
electriclizard May 30, 2023
3d56c7b
fix imports
May 30, 2023
7dd3d48
fix double model init
May 30, 2023
2a44f61
Merge pull request #5 from electriclizard/splitted-tokenization
electriclizard Jun 6, 2023
a795469
onnx runtime for model inference
Jun 6, 2023
3bd9d19
Merge branch 'electriclizard-solution2' into async
electriclizard Jun 6, 2023
1877c36
Merge pull request #29 from electriclizard/async
electriclizard Jun 6, 2023
cb32717
onnx graph optimization
Jun 6, 2023
64df8c8
Merge branch 'electriclizard-solution2' into model-optimization
electriclizard Jun 6, 2023
c4e307b
Merge pull request #6 from electriclizard/async
electriclizard Jun 6, 2023
ccde086
Merge pull request #7 from electriclizard/model-optimization
electriclizard Jun 6, 2023
1377de4
tensorrt inference
Jun 7, 2023
9d01b51
Merge pull request #31 from electriclizard/trt-provider
electriclizard Jun 7, 2023
4 changes: 2 additions & 2 deletions autotests/helm/values.yaml
@@ -25,8 +25,8 @@ global:
activeDeadlineSeconds: 3600 # 1h

env:
- PARTICIPANT_NAME: <REPLACE_WITH_USERNAME>
- api_host: http://inca-smc-mlops-challenge-solution.default.svc.cluster.local/<REPLACE_WITH_ENDPOINT>
+ PARTICIPANT_NAME: electriclizard
+ api_host: http://inca-smc-mlops-challenge-solution.default.svc.cluster.local/process

# K6, do not edit!
K6_PROMETHEUS_RW_SERVER_URL: http://kube-prometheus-stack-prometheus.monitoring.svc.cluster.local:9090/api/v1/write
14 changes: 14 additions & 0 deletions solution/Dockerfile
@@ -0,0 +1,14 @@
FROM ghcr.io/els-rd/transformer-deploy:0.4.0

ARG DEBIAN_FRONTEND=noninteractive

WORKDIR /src
# WORKDIR is not exposed as a variable to ENV/COPY, so the path is written out explicitly
ENV PYTHONPATH="${PYTHONPATH}:/src"

COPY requirements.txt /src/

RUN pip install -U --no-cache-dir -r requirements.txt

COPY . /src/

ENTRYPOINT [ "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8080", "--workers", "1" ]
113 changes: 113 additions & 0 deletions solution/app.py
@@ -0,0 +1,113 @@
from typing import List
import asyncio

import uvicorn
from fastapi import FastAPI, APIRouter
from fastapi.openapi.docs import get_swagger_ui_html
from fastapi.openapi.utils import get_openapi
from fastapi.responses import HTMLResponse
from starlette.requests import Request

from configs.config import AppConfig, ModelConfig
from infrastructure.models import TrtTransformerTextClassificationModel
from service.recognition import TextClassificationService
from handlers.recognition import PredictionHandler
from handlers.data_models import ResponseSchema


config = AppConfig.parse_file("./configs/app_config.yaml")
models = [
TrtTransformerTextClassificationModel(conf.model, conf.model_path, conf.tokenizer)
for conf in config.models
]

recognition_service = TextClassificationService(models)
recognition_handler = PredictionHandler(recognition_service, config.timeout)

app = FastAPI()
router = APIRouter()


@app.on_event("startup")
def count_max_batch_size():
print("Calculating Max batch size")
app.batch_size = 100

try:
while True:
text = ["this is simple text"]*app.batch_size
inputs = [model.tokenize_texts(text) for model in models]
outputs = [model(m_inputs) for model, m_inputs in zip(models, inputs)]
app.batch_size += 100

    except RuntimeError as err:
        if "CUDA out of memory" in str(err):
            app.batch_size -= 100
        print(f"Max batch size calculated = {app.batch_size}")


@app.on_event("startup")
def create_queues():
app.models_queues = {}
for md in models:
task_queue = asyncio.Queue()
app.models_queues[md.name] = task_queue
asyncio.create_task(recognition_handler.handle(md.name, task_queue, app.batch_size))


@router.post("/process", response_model=ResponseSchema)
async def process(request: Request):
text = (await request.body()).decode()

results = []
response_q = asyncio.Queue() # init a response queue for every request, one for all models
for model_name, model_queue in app.models_queues.items():
await model_queue.put((text, response_q))
model_res = await response_q.get()
results.append(model_res)
return recognition_handler.serialize_answer(results)


app.include_router(router)


@app.get("/healthcheck")
async def main():
return {"message": "I am alive"}


def custom_openapi():
if app.openapi_schema:
return app.openapi_schema
openapi_schema = get_openapi(
title="NLP Model Service",
version="0.1.0",
description="Inca test task",
routes=app.routes,
)
app.openapi_schema = openapi_schema
return app.openapi_schema


@app.get(
"/documentation/swagger-ui/",
response_class=HTMLResponse,
)
async def swagger_ui_html():
return get_swagger_ui_html(
openapi_url="/documentation/openapi.json",
title="API documentation"
)


@app.get(
"/documentation/openapi.json",
response_model_exclude_unset=True,
response_model_exclude_none=True,
)
async def openapi_endpoint():
return custom_openapi()


if __name__ == "__main__":
uvicorn.run("app:app", host="0.0.0.0", port=config.port, workers=config.workers)
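
For reference, a minimal client-side sketch of calling the /process route defined above. The endpoint reads the raw request body as text and returns one prediction per model, keyed by model name; httpx is only an illustrative client choice, not part of this PR.

import httpx

resp = httpx.post("http://localhost:8080/process", content="this is simple text")
print(resp.json())  # {"cardiffnlp": {"score": ..., "label": ...}, "ivanlau": {...}, ...}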
22 changes: 22 additions & 0 deletions solution/configs/app_config.yaml
@@ -0,0 +1,22 @@
models:
- model: "cardiffnlp"
model_path: "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer: "cardiffnlp/twitter-xlm-roberta-base-sentiment"
- model: "ivanlau"
model_path: "ivanlau/language-detection-fine-tuned-on-xlm-roberta-base"
tokenizer: "ivanlau/language-detection-fine-tuned-on-xlm-roberta-base"
- model: "svalabs"
model_path: "svalabs/twitter-xlm-roberta-crypto-spam"
tokenizer: "svalabs/twitter-xlm-roberta-crypto-spam"
- model: "EIStakovskii"
model_path: "EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus"
tokenizer: "EIStakovskii/xlm_roberta_base_multilingual_toxicity_classifier_plus"
- model: "jy46604790"
model_path: "jy46604790/Fake-News-Bert-Detect"
tokenizer: "jy46604790/Fake-News-Bert-Detect"

port: 8080
workers: 1

timeout: 0.01

20 changes: 20 additions & 0 deletions solution/configs/config.py
@@ -0,0 +1,20 @@
from typing import List

from pydantic_yaml import YamlModel


class ModelConfig(YamlModel):
model: str
model_path: str
tokenizer: str


class AppConfig(YamlModel):
# model parameters
models: List[ModelConfig]
# app parameters
port: int
workers: int
# async queues parameters
timeout: float
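
As a reading aid, a minimal sketch of how this config class is consumed (it mirrors the call already made in app.py; the values come from configs/app_config.yaml above):

from configs.config import AppConfig

config = AppConfig.parse_file("./configs/app_config.yaml")
print(config.models[0].model_path)        # cardiffnlp/twitter-xlm-roberta-base-sentiment
print(config.port, config.workers, config.timeout)  # 8080 1 0.01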

17 changes: 17 additions & 0 deletions solution/handlers/data_models.py
@@ -0,0 +1,17 @@
from typing import List

from pydantic import BaseModel, validator


class RecognitionSchema(BaseModel):
score: float
label: str


class ResponseSchema(BaseModel):
cardiffnlp: RecognitionSchema
ivanlau: RecognitionSchema
svalabs: RecognitionSchema
EIStakovskii: RecognitionSchema
jy46604790: RecognitionSchema

55 changes: 55 additions & 0 deletions solution/handlers/recognition.py
@@ -0,0 +1,55 @@
from typing import List
import asyncio

from pydantic import ValidationError

from infrastructure.models import TextClassificationModelData
from service.recognition import TextClassificationService
from handlers.data_models import ResponseSchema, RecognitionSchema


class PredictionHandler:

def __init__(self, recognition_service: TextClassificationService, timeout: float):
self.recognition_service = recognition_service
self.timeout = timeout

async def handle(self, model_name, model_queue, max_batch_size: int):
while True:
inputs = None
texts = []
queues = []

try:
while True:
(text, response_queue) = await asyncio.wait_for(model_queue.get(), timeout=self.timeout)
texts.append(text)
queues.append(response_queue)
except asyncio.exceptions.TimeoutError:
pass

if texts:
model = next(
(model for model in self.recognition_service.service_models if model.name == model_name),
None
)
                if model:
                    batch_start = 0
                    for text_batch in self._perform_batches(texts, max_batch_size):
                        inputs = model.tokenize_texts(text_batch)
                        outs = model(inputs)
                        # send each result back on the response queue of the request it came from
                        for rq, out in zip(queues[batch_start:batch_start + len(outs)], outs):
                            await rq.put(out)
                        batch_start += len(outs)

def serialize_answer(self, results: List[TextClassificationModelData]) -> ResponseSchema:
res_model = {rec.model_name: self._recognitions_to_schema(rec) for rec in results}
return ResponseSchema(**res_model)

def _recognitions_to_schema(self, recognition: TextClassificationModelData) -> RecognitionSchema:
if recognition.model_name != "ivanlau":
recognition.label = recognition.label.upper()
return RecognitionSchema(score=recognition.score, label=recognition.label)

def _perform_batches(self, texts: List[str], max_batch_size):
for i in range(0, len(texts), max_batch_size):
yield texts[i:i + max_batch_size]
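
A small illustrative sketch of the batching helper in isolation (recognition_service here is assumed to be the TextClassificationService instance built in app.py; the texts and batch size are arbitrary):

from handlers.recognition import PredictionHandler

handler = PredictionHandler(recognition_service, timeout=0.01)
batches = list(handler._perform_batches(["t1", "t2", "t3", "t4", "t5"], max_batch_size=2))
# batches == [["t1", "t2"], ["t3", "t4"], ["t5"]]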

13 changes: 13 additions & 0 deletions solution/helm/envs/electriclizard.yaml
@@ -0,0 +1,13 @@
global:
# add any variables you need in format `key: value`
# variables will be available in the container as environment variables

# change 8000 to your application target port
pod:
ports:
- name: http
containerPort: 8080
protocol: TCP
service:
targetPort: 8080

107 changes: 107 additions & 0 deletions solution/infrastructure/models.py
@@ -0,0 +1,107 @@
from abc import ABC, abstractmethod
from collections.abc import Callable
from dataclasses import dataclass
from typing import List

import torch
import onnxruntime
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from optimum.onnxruntime import ORTModelForSequenceClassification


session_options = onnxruntime.SessionOptions()
session_options.log_severity_level = 1


@dataclass
class TextClassificationModelData:
model_name: str
label: str
score: float


class BaseTextClassificationModel(ABC):

def __init__(self, name: str, model_path: str, tokenizer: str):
self.name = name
self.model_path = model_path
self.tokenizer = tokenizer
self.device = 0 if torch.cuda.is_available() else -1
self._load_model()

@abstractmethod
def _load_model(self):
...

@abstractmethod
def __call__(self, inputs) -> List[TextClassificationModelData]:
...


class TransformerTextClassificationModel(BaseTextClassificationModel):

def _load_model(self):
self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
self.model = self.model.to(self.device)

def tokenize_texts(self, texts: List[str]):
inputs = self.tokenizer.batch_encode_plus(
texts,
add_special_tokens=True,
padding='longest',
truncation=True,
return_token_type_ids=True,
return_tensors='pt'
)
inputs = {k: v.to(self.device) for k, v in inputs.items()} # Move inputs to GPU
return inputs

def _results_from_logits(self, logits: torch.Tensor):
id2label = self.model.config.id2label

label_ids = logits.argmax(dim=1)
scores = logits.softmax(dim=-1)
results = [
{
"label": id2label[label_id.item()],
"score": score[label_id.item()].item()
}
for label_id, score in zip(label_ids, scores)
]
return results

def __call__(self, inputs) -> List[TextClassificationModelData]:
logits = self.model(**inputs).logits
predictions = self._results_from_logits(logits)
predictions = [TextClassificationModelData(self.name, **prediction) for prediction in predictions]
return predictions


class TrtTransformerTextClassificationModel(TransformerTextClassificationModel):

def _load_model(self):
provider_options = {
"trt_engine_cache_enable": True,
"trt_engine_cache_path": f"tmp/{self.name}"
}

self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
self.model = ORTModelForSequenceClassification.from_pretrained(
self.model_path,
export=True,
provider="TensorrtExecutionProvider",
provider_options=provider_options,
session_options=session_options,
)

def tokenize_texts(self, texts: List[str]):
inputs = self.tokenizer(
texts,
add_special_tokens=True,
padding=True,
truncation=True,
return_tensors="pt"
).to("cuda")
return inputs
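
A minimal usage sketch of the TensorRT-backed class above (assumes a CUDA device and the TensorRT execution provider are available; the name and paths are taken from configs/app_config.yaml):

from infrastructure.models import TrtTransformerTextClassificationModel

model = TrtTransformerTextClassificationModel(
    "cardiffnlp",
    "cardiffnlp/twitter-xlm-roberta-base-sentiment",
    "cardiffnlp/twitter-xlm-roberta-base-sentiment",
)
inputs = model.tokenize_texts(["this is simple text"])
predictions = model(inputs)  # List[TextClassificationModelData]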
