What is Tokenization Drift and How to Fix It?
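Subword tokenizers treat " word" and "word" as two different vocabulary entries, so the very same word drifts to a completely different token ID depending on whether a space precedes it. The script below makes that visible by encoding a handful of words both ways and plotting the resulting token IDs side by side. The setup is a minimal sketch: the GPT-2 tokenizer and the (index, word) list `pairs` are illustrative stand-ins, and any Hugging Face BPE tokenizer should reproduce the effect.

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from transformers import AutoTokenizer

# Assumed setup: GPT-2's BPE tokenizer and a hypothetical (index, word) list;
# the code below only reads p[1], the word itself.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
pairs     = list(enumerate(["hello", "world", "token", "space", "drift", "model"]))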


# Encode each word with and without a leading space; [0] keeps only the
# first subword token in case a word splits into several.
words     = [p[1] for p in pairs]
ids_ws    = [tokenizer.encode(" " + w, add_special_tokens=False)[0] for w in words]
ids_nws   = [tokenizer.encode(w,       add_special_tokens=False)[0] for w in words]
delta     = [abs(a - b) for a, b in zip(ids_ws, ids_nws)]
 
x = np.arange(len(words))
width = 0.35
 
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.patch.set_facecolor("#FAFAF8")
 
# Left: side-by-side token IDs
ax = axes[0]
ax.set_facecolor("#FAFAF8")
bars1 = ax.bar(x - width/2, ids_ws,  width, label="With leading space",    color="#3B6FE0", alpha=0.85)
bars2 = ax.bar(x + width/2, ids_nws, width, label="Without leading space",  color="#E05C3B", alpha=0.85)
ax.set_xticks(x)
ax.set_xticklabels(words, rotation=30, ha="right", fontsize=9)
ax.set_ylabel("Token ID", fontsize=10)
ax.set_title("Token IDs: ' word'  vs  'word'", fontsize=12, fontweight="bold", pad=12)
ax.legend(fontsize=9)
ax.spines[["top", "right"]].set_visible(False)
ax.grid(axis="y", alpha=0.3)
 
for bar in bars1:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
            str(int(bar.get_height())), ha="center", va="bottom", fontsize=7, color="#3B6FE0")
for bar in bars2:
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
            str(int(bar.get_height())), ha="center", va="bottom", fontsize=7, color="#E05C3B")
 
# Right: delta
ax2 = axes[1]
ax2.set_facecolor("#FAFAF8")
color_bars = ["#E05C3B" if d > 500 else "#F0A070" if d > 100 else "#A8C4F0" for d in delta]
bars3 = ax2.bar(words, delta, color=color_bars, alpha=0.9)
ax2.set_ylabel("Absolute Token ID Distance", fontsize=10)
ax2.set_title("How Far Apart Are the Token IDs?", fontsize=12, fontweight="bold", pad=12)
ax2.set_xticks(x)  # fix tick positions first to avoid a matplotlib warning
ax2.set_xticklabels(words, rotation=30, ha="right", fontsize=9)
ax2.spines[["top", "right"]].set_visible(False)
ax2.grid(axis="y", alpha=0.3)
 
for bar, d in zip(bars3, delta):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 10,
             str(d), ha="center", va="bottom", fontsize=9, fontweight="bold")
 
high  = mpatches.Patch(color="#E05C3B", alpha=0.9, label="> 500 apart")
med   = mpatches.Patch(color="#F0A070", alpha=0.9, label="100-500 apart")
low   = mpatches.Patch(color="#A8C4F0", alpha=0.9, label="< 100 apart")
ax2.legend(handles=[high, med, low], fontsize=8)
 
plt.tight_layout(pad=2)
plt.suptitle("Tokenization Artifacts: One Space, Completely Different Token", 
             fontsize=14, fontweight="bold", y=1.02)
plt.savefig("tokenization_artifact.png", dpi=150, bbox_inches="tight", facecolor="#FAFAF8")
plt.show()
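With a BPE vocabulary like GPT-2's, " word" and "word" are usually unrelated entries, so the deltas in the right panel often run into the thousands: one space does not shift the ID slightly, it jumps to a completely different token.

Because a word in running text is almost always preceded by a space, one common mitigation is to normalize the leading space before encoding, rather than comparing raw IDs. A minimal sketch, reusing the tokenizer from above (the helper name `encode_word` is illustrative, not a library API):

def encode_word(tokenizer, word, prefix_space=True):
    # Normalize to a single leading-space convention so the same word
    # always resolves to the same vocabulary entry.
    text = (" " + word.lstrip()) if prefix_space else word.lstrip()
    return tokenizer.encode(text, add_special_tokens=False)

# Both spellings now yield identical token sequences:
assert encode_word(tokenizer, "drift") == encode_word(tokenizer, " drift")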


