If we embed the text 'a cat' and the rendered PNG of three ASCII art strings — one recognizable cat shape, one plausible ASCII art of a different subject, and random noise — into Gemini Embedding 2's shared space, the cosine similarity scores will follow the ordering: good > bad > noise, without any human labels or subject-specific heuristics.
Three ASCII art strings represent three levels of semantic alignment with the text prompt 'a cat':
This is the classic ASCII cat: ears, eyes, nose, whisker stub. A human would immediately recognize it as a cat. The embedding model should too.
/\_/\
( o.o )
> ^ <
This is structurally valid ASCII art — it uses line-drawing characters consistently and depicts something coherent (a house with walls and a foundation). But it is not a cat. The metric should penalize the mismatch.
_____
| |
|_____|
| |
Random printable characters with no structural intent. No recognizable shape, no semantic content. This should score at or below the noise floor.
x@#$%^
&*()_+
!?><{}
The pipeline has four steps. Each is a small, focused function:
The embedding model accepts images, not text-as-text. We need to rasterize the ASCII art into a PNG before embedding. We use PIL with a monospace font (DejaVu Sans Mono) so character alignment is preserved.
# Monospace font keeps the character grid aligned after rasterization.
# NOTE(review): hardcoded Linux path — breaks where DejaVu is not installed.
FONT = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf", 20)
PAD = 16  # white margin, in pixels, on every side of the rendered text
def render(ascii_text: str) -> bytes:
    """Rasterize *ascii_text* to PNG bytes: black monospace text on white.

    The canvas is sized to the widest line plus PAD pixels on every side.
    Empty input yields a small blank image instead of raising — the
    original crashed with ValueError (max() over zero lines).
    """
    lines = ascii_text.splitlines() or [""]  # guard: empty art → blank canvas
    # Measure with a throwaway 1x1 image; only the draw context is needed.
    probe = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    line_height = FONT.getbbox("A")[3] + 4  # glyph bottom + a little leading
    max_w = max(probe.textlength(line, font=FONT) for line in lines)
    w = int(max_w) + PAD * 2
    h = line_height * len(lines) + PAD * 2
    # Draw each line top-to-bottom at a fixed left offset.
    img = Image.new("RGB", (w, h), color=(255, 255, 255))
    draw = ImageDraw.Draw(img)
    y = PAD
    for line in lines:
        draw.text((PAD, y), line, fill=(0, 0, 0), font=FONT)
        y += line_height
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()
Call the Gemini embedding API with the text prompt and SEMANTIC_SIMILARITY task type. This returns a 3072-dimensional float vector.
def embed_text(text: str) -> np.ndarray:
    """Embed *text* in the shared space; returns a float32 vector."""
    response = client.models.embed_content(
        model=MODEL,
        contents=[text],
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY"),
    )
    vector = response.embeddings[0].values
    return np.asarray(vector, dtype=np.float32)
Call the same API with the PNG bytes as a Part. Note: no task type for image embeddings — the model infers the modality from the mime type.
def embed_image(png_bytes: bytes) -> np.ndarray:
    """Embed raw PNG bytes in the shared space; returns a float32 vector."""
    part = types.Part.from_bytes(data=png_bytes, mime_type="image/png")
    response = client.models.embed_content(model=MODEL, contents=[part])
    return np.asarray(response.embeddings[0].values, dtype=np.float32)
Standard cosine: dot product divided by the product of the norms. For nonzero vectors this is a scalar in [−1, 1]; it is undefined when either vector has zero norm.
def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of *a* and *b*, in [-1, 1].

    Returns 0.0 when either vector has zero norm: the similarity is
    undefined there, and the original form divided by zero (NaN).
    """
    denom = float(np.linalg.norm(a)) * float(np.linalg.norm(b))
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b)) / denom
All four steps composed into a single callable:
def ascii_metric(description: str, ascii_art: str) -> float:
    """Score how well *ascii_art*, once rendered to PNG, matches *description*."""
    return cosine(embed_text(description), embed_image(render(ascii_art)))
The scores confirm the hypothesis:
good (0.404) > bad (0.366) > noise (0.296). The embedding space correctly rank-orders the three ASCII art strings by semantic alignment with the text prompt, without any subject-specific heuristics or human labels.
The absolute values look low if you are used to within-modality similarity scores, where well-matched pairs often score above 0.85. Cross-modal similarity is structurally lower — text and image embeddings live in overlapping but distinct regions of the space. A score of 0.40 for a text-image pair represents strong alignment.
What matters more than absolute value is spread: the difference between good and noise is 0.108, which is a clear signal well above any reasonable noise floor. The noise floor in practice is determined by random vector pairs, which in 3072 dimensions cluster tightly around 0.0. A cross-modal score of 0.30 is already well above that baseline.
The spread also has an important property: it is monotonically related to semantic quality. You do not need to threshold the scores or calibrate them to get useful signal. Any optimizer that wants to maximize the score will automatically prefer outputs that are more semantically aligned with the input description.
This is a proof of concept, not a study: one prompt, three hand-picked samples, a single model, and no statistical controls.
The full runnable proof-of-concept script:
"""
Proof of concept: multimodal embedding as ASCII art evaluation metric.
Three ASCII representations of a cat, ranked by quality:
good - recognizable cat shape
bad - plausible ASCII but wrong subject (a house)
noise - random characters
We embed the text "a cat" and each rendered PNG in Gemini's shared space,
then measure cosine similarity. The hypothesis: good > bad > noise.
"""
import os
import io
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from google import genai
from google.genai import types
# ── subjects ────────────────────────────────────────────────────────────────
# Text prompt that each rendered ASCII image is scored against.
DESCRIPTION = "a cat"
# Recognizable cat (ears, eyes, whiskers) — expected to score highest.
ASCII_GOOD = """\
/\\_/\\
( o.o )
> ^ <
"""
# Coherent ASCII art of the wrong subject (a house) — expected middle score.
ASCII_BAD = """\
_____
| |
|_____|
| |
"""
# Random printable characters, no structure — expected lowest score.
ASCII_NOISE = """\
x@#$%^
&*()_+
!?><{}
"""
# ── render ASCII → PNG ───────────────────────────────────────────────────────
# Monospace font keeps the character grid aligned after rasterization.
# NOTE(review): hardcoded Linux path — breaks where DejaVu is not installed.
FONT = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf", 20)
PAD = 16  # white margin, in pixels, on every side of the rendered text
def render(ascii_text: str) -> bytes:
    """Rasterize *ascii_text* to PNG bytes: black monospace text on white.

    The canvas is sized to the widest line plus PAD pixels on every side.
    Empty input yields a small blank image instead of raising — the
    original crashed with ValueError (max() over zero lines).
    """
    lines = ascii_text.splitlines() or [""]  # guard: empty art → blank canvas
    # measure: throwaway 1x1 image; only the draw context is needed
    probe = ImageDraw.Draw(Image.new("RGB", (1, 1)))
    line_height = FONT.getbbox("A")[3] + 4  # glyph bottom + a little leading
    max_w = max(probe.textlength(line, font=FONT) for line in lines)
    w = int(max_w) + PAD * 2
    h = line_height * len(lines) + PAD * 2
    # draw: each line top-to-bottom at a fixed left offset
    img = Image.new("RGB", (w, h), color=(255, 255, 255))
    draw = ImageDraw.Draw(img)
    y = PAD
    for line in lines:
        draw.text((PAD, y), line, fill=(0, 0, 0), font=FONT)
        y += line_height
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()
# ── embed ────────────────────────────────────────────────────────────────────
# Fails fast with KeyError at import time if GEMINI_API_KEY is unset.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])
MODEL = "gemini-embedding-2-preview"  # accepts both text and image content
def embed_text(text: str) -> np.ndarray:
    """Embed *text* in the shared space; returns a float32 vector."""
    response = client.models.embed_content(
        model=MODEL,
        contents=[text],
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY"),
    )
    vector = response.embeddings[0].values
    return np.asarray(vector, dtype=np.float32)
def embed_image(png_bytes: bytes) -> np.ndarray:
    """Embed raw PNG bytes in the shared space; returns a float32 vector."""
    part = types.Part.from_bytes(data=png_bytes, mime_type="image/png")
    response = client.models.embed_content(model=MODEL, contents=[part])
    return np.asarray(response.embeddings[0].values, dtype=np.float32)
def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of *a* and *b*, in [-1, 1].

    Returns 0.0 when either vector has zero norm: the similarity is
    undefined there, and the original form divided by zero (NaN).
    """
    denom = float(np.linalg.norm(a)) * float(np.linalg.norm(b))
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b)) / denom
# ── metric ────────────────────────────────────────────────────────────────────
def ascii_metric(description: str, ascii_art: str) -> float:
    """Score how well *ascii_art*, once rendered to PNG, matches *description*."""
    return cosine(embed_text(description), embed_image(render(ascii_art)))
# ── run ───────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # (label, art) pairs in hypothesized descending order of quality.
    cases = [
        ("good (cat shape)", ASCII_GOOD),
        ("bad (house shape)", ASCII_BAD),
        ("noise (random chars)", ASCII_NOISE),
    ]
    print(f"Description: '{DESCRIPTION}'\n")
    print(f"{'Label':<26} {'Score':>8} {'Bar'}")
    print("-" * 60)
    scores: dict[str, float] = {}
    for label, art in cases:
        s = ascii_metric(DESCRIPTION, art)
        scores[label] = s
        print(f"{label:<26} {s:>8.4f} {'█' * int(s * 40)}")
    print()
    print(f"Highest: {max(scores, key=scores.get)}")
    # Unpack in the same good/bad/noise order the cases were defined in.
    good_score, bad_score, noise_score = (scores[lbl] for lbl, _ in cases)
    hypothesis = good_score > bad_score > noise_score
    print(f"Hypothesis (good > bad > noise): {hypothesis}")
    print(f" good={good_score:.4f} bad={bad_score:.4f} noise={noise_score:.4f}")