Skip to content

Commit

Permalink
Add gradio demo
Browse files Browse the repository at this point in the history
  • Loading branch information
titusz committed Aug 13, 2024
1 parent 0f21b0e commit 71fabc8
Show file tree
Hide file tree
Showing 8 changed files with 1,482 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
${{ runner.os }}-onnx-model-
- name: Install Dependencies
run: poetry install
run: poetry install -E demo

- name: Run Tests
run: poetry run pytest -n auto --cov=iscc_sct --cov-report=xml -v tests
run: poetry run pytest --cov=iscc_sct --cov-report=xml -v tests
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## [0.1.2] - Unreleased
- Encode granular features with base64
- Refactor result format to generic ISCC data model
- Add optional gradio GUI demo

## [0.1.1] - 2024-06-25
- Handle text decoding errors gracefully
Expand Down
153 changes: 153 additions & 0 deletions iscc_sct/demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
"""
Gradio demo showcasing ISCC Semantic Text Code.
The demo features:
- two side by side text inputs.
- One sample text per input (one sample in English and the other a German translation of it)
- One slider to set global bitlength (64-256 bits in steps of 32 with 64 as default)
- One result output per text input
The user can select the samples or write or paste text into the inputs and generate ISCC Semantic
Text Codes for the Texts. Below the result outputs we show the similarity of the two codes.
"""

from loguru import logger as log
import gradio as gr
import iscc_sct as sct


def compute_iscc_code(text1, text2, bit_length):
    """Generate Semantic Text-Codes for both texts and render their similarity.

    Returns a 3-tuple: the ISCC of ``text1``, the ISCC of ``text2`` and the value
    returned by ``compare_codes`` for the two codes at ``bit_length`` bits.
    """
    results = [sct.gen_text_code_semantic(text, bits=bit_length) for text in (text1, text2)]
    iscc_a, iscc_b = (result["iscc"] for result in results)
    bar = compare_codes(iscc_a, iscc_b, bit_length)
    return iscc_a, iscc_b, bar


def compare_codes(code_a, code_b, bits):
    """Return the similarity bar HTML for two ISCC codes, or None if either is missing.

    The Hamming distance between the codes is converted to an approximate cosine
    similarity over ``bits`` dimensions and then rendered as an HTML bar.
    """
    # Nothing to compare until both codes are available.
    if not (code_a and code_b):
        return None
    distance = sct.iscc_distance(code_a, code_b)
    cosine = hamming_to_cosine(distance, bits)
    return generate_similarity_bar(cosine)


def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
    """Approximate the cosine similarity for a given hamming distance and dimension.

    A distance of 0 maps to 1.0, a distance of ``dim / 2`` to 0.0 and a distance of
    ``dim`` to -1.0.
    """
    doubled_distance = 2 * hamming_distance
    return 1 - doubled_distance / dim


def generate_similarity_bar(similarity):
    """Render a similarity value in [-1, 1] as an HTML bar labelled -100% to +100%.

    The bar grows from the centre of its container: to the right in green for
    non-negative values, to the left in red for negative values. The label shows
    the similarity as a percentage with two decimals.
    """
    # Label value: [-1, 1] scaled to [-100, 100].
    percent = similarity * 100

    # Bar width in percent of the container; each direction only has half of it.
    width_pct = int(abs(similarity) * 50)

    # Colour, anchor side and label placement all depend on the sign.
    if similarity >= 0:
        fill = "green"
        anchor = "left"
        label_anchor = "left: 50%;"
        label_shift = "transform: translateX(-50%);"
    else:
        fill = "red"
        anchor = "right"
        label_anchor = "right: 50%;"
        label_shift = "transform: translateX(50%);"

    # The empty first and last entries reproduce the leading and trailing newline.
    html_lines = [
        "",
        "<h3>Semantic Similarity</h3>",
        "<div style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>",
        f"<div style='height: 100%; width: {width_pct}%; background-color: {fill}; position: absolute; {anchor}: 50%;'>",
        f"<span style='position: absolute; width: 100%; {label_anchor} top: 0; line-height: 30px; color: white; {label_shift}'>{percent:.2f}%</span>",
        "</div>",
        "</div>",
        "",
    ]
    return "\n".join(html_lines)


# Sample texts offered as one-click examples below the two text inputs
# (English for the first input, German for the second).
sample_text_en = "This is a sample text in English to demonstrate the ISCC-CODE generation."
sample_text_de = "Dies ist ein Beispieltext auf Deutsch, um die Erzeugung von ISCC-CODES zu demonstrieren."

# Extra CSS handed to gr.Blocks via its `css` argument.
# NOTE(review): no component with the id "chunked-text" is defined in the visible part
# of this file - confirm this rule is still needed.
custom_css = """
#chunked-text span.label {
text-transform: none !important;
}
"""

# Theme handed to gr.Blocks via its `theme` argument: the default Gradio theme with
# custom Google fonts and the `radius_none` radius preset.
iscc_theme = gr.themes.Default(
    font=[gr.themes.GoogleFont("Readex Pro")],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
    radius_size=gr.themes.sizes.radius_none,
)

with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
    # Header row with the demo title.
    with gr.Row(variant="panel"):
        gr.Markdown(
            "\n"
            "## ✂️ ISCC Semantic Text-Code\n"
            "Demo of cross-lingual Semantic Text-Code (proof of concept)\n",
        )

    # Global bit-length used for every generated code.
    with gr.Row(variant="panel"):
        in_iscc_bits = gr.Slider(
            label="ISCC Bit-Length",
            info="NUMBER OF BITS FOR OUTPUT ISCC",
            minimum=64,
            maximum=256,
            step=32,
            value=64,
        )

    # Two side-by-side columns, each with a text input, a sample and a code output.
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            in_text_a = gr.TextArea(
                label="Text",
                placeholder="Paste your text here or select sample from below",
                lines=12,
                max_lines=12,
            )

            gr.Examples(label="Sample Text", examples=[sample_text_en], inputs=[in_text_a])
            out_code_a = gr.Textbox(label="ISCC Code for Text A")
        with gr.Column(variant="panel"):
            in_text_b = gr.TextArea(
                label="Text",
                placeholder="Paste your text here or select sample from below",
                lines=12,
                max_lines=12,
            )

            gr.Examples(label="Sample Text", examples=[sample_text_de], inputs=[in_text_b])
            out_code_b = gr.Textbox(label="ISCC Code for Text B")

    # Similarity bar rendered below both code outputs.
    with gr.Row(variant="panel"):
        with gr.Column(variant="panel"):
            out_similarity = gr.HTML(label="Similarity")

    def process_text(text, nbits, suffix):
        """Generate the ISCC Semantic Text-Code for one of the two text inputs.

        :param text: Content of the text input; may be empty or None.
        :param nbits: Bit-length taken from the slider.
        :param suffix: "a" or "b", selecting which code output receives the result.
        :return: Mapping of the selected output component to its updated Textbox,
            or None when there is no text to process.
        :raise KeyError: If suffix is neither "a" nor "b".
        """
        # Guard before touching the text: slicing a None value for the log line
        # would raise TypeError.
        if not text:
            return None
        log.debug(f"{text[:20]}")
        # Explicit lookup instead of globals(): an unknown suffix fails loudly
        # rather than producing a None key.
        out_code_func = {"a": out_code_a, "b": out_code_b}[suffix]
        iscc = sct.Metadata(**sct.gen_text_code_semantic(text, bits=nbits))
        return {out_code_func: gr.Textbox(value=iscc.iscc)}

    # Regenerate a column's code whenever its text changes.
    # NOTE(review): no listener is registered on in_iscc_bits, so moving the slider
    # does not regenerate existing codes, while compare_codes reads the current
    # slider value - confirm whether the codes should be recomputed on slider change.
    in_text_a.change(
        lambda text, nbits: process_text(text, nbits, "a"),
        inputs=[in_text_a, in_iscc_bits],
        outputs=[out_code_a],
        show_progress="full",
    )
    in_text_b.change(
        lambda text, nbits: process_text(text, nbits, "b"),
        inputs=[in_text_b, in_iscc_bits],
        outputs=[out_code_b],
        show_progress="full",
    )

    # Refresh the similarity bar whenever either code changes.
    out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
    out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
    with gr.Row():
        gr.ClearButton(components=[in_text_a, in_text_b])


# Launch the demo only when this module is executed as a script.
if __name__ == "__main__":  # pragma: no cover
    demo.launch()
58 changes: 56 additions & 2 deletions iscc_sct/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from base64 import b32encode
from pybase64 import urlsafe_b64encode
import math
from base64 import b32encode, b32decode
from pybase64 import urlsafe_b64encode, urlsafe_b64decode
from loguru import logger as log
import os
import time
Expand All @@ -21,6 +22,7 @@
"encode_base32",
"encode_base64",
"hamming_distance",
"iscc_distance",
"MODEL_PATH",
]

Expand Down Expand Up @@ -94,6 +96,18 @@ def encode_base32(data):
return b32encode(data).decode("ascii").rstrip("=")


def decode_base32(code):
    # type: (str) -> bytes
    """
    Standard RFC4648 base32 decoding without padding and with casefolding.

    :param code: Base32 encoded string without trailing "=" padding (any case).
    :return: The decoded bytes.
    """
    # The stdlib decoder insists on padding, so fill up to the next multiple of 8.
    missing = -len(code) % 8
    padded = code + "=" * missing
    return bytes(b32decode(padded, casefold=True))


def encode_base64(data):
# type: (bytes) -> str
"""
Expand All @@ -103,6 +117,16 @@ def encode_base64(data):
return code.rstrip("=")


def decode_base64(code):
    # type: (str) -> bytes
    """
    Standard RFC4648 base64url decoding without padding.

    :param code: Base64url encoded string with the trailing "=" padding stripped.
    :return: The decoded bytes.
    """
    # Re-pad to the next multiple of 4 characters. `-len(code) % 4` is 0 for input
    # that is already aligned, where `4 - len(code) % 4` would append four
    # superfluous "=" and depend on the decoder tolerating excess padding.
    padding = -len(code) % 4
    return urlsafe_b64decode(code + "=" * padding)


def hamming_distance(a, b):
# type: (bytes, bytes) -> int
"""
Expand All @@ -122,3 +146,33 @@ def hamming_distance(a, b):
distance += bin(xor_result).count("1")

return distance


def iscc_distance(iscc1, iscc2):
    # type: (str, str) -> int
    """
    Calculate the Hamming distance between two ISCC Semantic Text Codes.

    An optional "ISCC:" prefix is stripped from each code before base32 decoding.
    The first two decoded bytes of each code (the header) are excluded from the
    comparison.

    :param iscc1: The first ISCC Semantic Text Code.
    :param iscc2: The second ISCC Semantic Text Code.
    :return: The Hamming distance between the two ISCC codes.
    :raise ValueError: If the input ISCCs are not valid or of different lengths.
    """
    prefix = "ISCC:"
    header_size = 2

    # Strip the optional prefix from both codes before decoding either of them.
    bare_codes = [code[len(prefix):] if code.startswith(prefix) else code for code in (iscc1, iscc2)]
    raw_a, raw_b = (decode_base32(bare) for bare in bare_codes)

    if len(raw_a) != len(raw_b):
        raise ValueError("The input ISCCs must have the same length")

    # Only the body after the header takes part in the distance.
    return hamming_distance(raw_a[header_size:], raw_b[header_size:])
Loading

0 comments on commit 71fabc8

Please sign in to comment.