Design a Complete Multimodal RLVR Pipeline with Open-MM-RL, Vision-Language Prompting, Reward Scoring, and GRPO Export


# Answer-extraction regexes, tried in priority order. Each has exactly one
# capture group holding the answer text.
EXTRACT_PATS = [
    r"\\boxed\{([^{}]+)\}",
    r"final\s+answer\s*[:=]\s*([^\n]+)",
    r"answer\s*[:=]\s*([^\n]+)",
]


def extract_final(text):
    """Pull the final answer string out of a free-form model response.

    The patterns in ``EXTRACT_PATS`` are tried in order (case-insensitive).
    For the first pattern that occurs at all, the LAST occurrence is used,
    so an intermediate ``\\boxed{...}`` or "answer:" earlier in the reasoning
    does not shadow the concluding one. If no pattern matches, the last
    non-blank line of the text is returned.

    Args:
        text: The response to scan. ``None`` yields ``""``; any other
            non-string value is converted with ``str()``.

    Returns:
        The extracted answer with surrounding whitespace and trailing
        ``.``/``,``/``;`` removed, or ``""`` when nothing can be extracted.
    """
    if text is None:
        return ""
    text = str(text)
    for pattern in EXTRACT_PATS:
        hits = re.findall(pattern, text, flags=re.IGNORECASE)
        if not hits:
            continue
        answer = hits[-1].strip().rstrip(".,;")
        core = answer.lstrip(".,;")
        dropped = answer[: len(answer) - len(core)]
        # Leading punctuation is noise, except a decimal point that starts a
        # number: ".5" must not be turned into "5".
        if dropped.endswith(".") and core[:1].isdigit():
            core = "." + core
        return core
    lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]
    return lines[-1] if lines else ""
def latex_to_sympy(s):
    """Rewrite a small subset of LaTeX math as a plain expression string.

    Handled constructs: surrounding ``$`` and a leading ``\\(``/``\\[`` or
    trailing ``\\)``/``\\]``, ``\\pi``, ``\\cdot``/``\\times``, thin-space
    commands, ``\\frac`` (also ``\\dfrac``/``\\tfrac``/``\\cfrac``),
    ``\\sqrt{...}``, ``^`` exponents and brace grouping. Any other backslash
    command is dropped.

    Args:
        s: LaTeX-ish text. ``None`` yields ``""``; other non-string values
            are converted with ``str()``.

    Returns:
        The rewritten expression string. It is NOT validated; the caller
        decides how to parse it.
    """
    s = "" if s is None else str(s)
    s = s.strip().strip("$").strip()
    s = re.sub(r"^\\[\[\(]", "", s)
    s = re.sub(r"\\[\]\)]$", "", s)
    for latex, plain in (
        ("\\pi", "pi"),
        ("\\cdot", "*"),
        ("\\times", "*"),
        ("\\,", ""),
        ("\\;", ""),
        ("\\!", ""),
    ):
        s = s.replace(latex, plain)
    # The frac/sqrt patterns only match brace-free arguments, so nested forms
    # such as \frac{\sqrt{2}}{2} need several passes, innermost first. Each
    # successful pass removes one command, so the loop terminates.
    previous = None
    while previous != s:
        previous = s
        s = re.sub(
            r"\\[cdt]?frac\s*\{([^{}]+)\}\s*\{([^{}]+)\}", r"((\1)/(\2))", s
        )
        s = re.sub(r"\\sqrt\s*\{([^{}]+)\}", r"sqrt(\1)", s)
    s = s.replace("^", "**")
    s = re.sub(r"\\[a-zA-Z]+", "", s)
    s = s.replace("{", "(").replace("}", ")")
    # LaTeX juxtaposition means multiplication ("7396\pi", "2\sqrt{3}");
    # make the operator explicit so the result is a parseable expression.
    s = re.sub(r"(?<=[\d)])\s*(?=pi\b|sqrt\()", "*", s)
    return s
def grade(pred, gold, tol=1e-4):
    """Verifiable reward in [0,1]: exact > numeric > sympy-symbolic > partial.

    The final answer is extracted from ``pred`` with ``extract_final`` and
    compared with ``gold`` in four stages; the first stage that succeeds
    decides the reward:

    1. normalized string equality (case, whitespace and surrounding
       ``$ . , ; [ ] ( )`` ignored)                              -> 1.0
    2. numeric closeness: ``|p - g| / max(1, |g|) < tol``        -> 1.0
    3. symbolic equality: ``simplify(p - g) == 0``               -> 1.0
    4. normalized gold is a substring of the normalized answer   -> 0.5

    Args:
        pred: Raw model output (converted with ``str()``).
        gold: Reference answer (converted with ``str()``).
        tol: Tolerance for the numeric stage.

    Returns:
        1.0, 0.5 or 0.0. ``None`` or empty input on either side scores 0.0.
    """
    if pred is None or gold is None:
        return 0.0
    p = extract_final(str(pred)).strip()
    g = str(gold).strip()
    # An empty answer or an empty reference cannot be verified; without this
    # guard two empty strings would compare equal and earn full reward.
    if not p or not g:
        return 0.0

    def norm(x):
        x = re.sub(r"\s+", "", x.lower()).rstrip("$.,;[]()")
        core = x.lstrip("$.,;[]()")
        dropped = x[: len(x) - len(core)]
        # Keep a leading decimal point so ".5" does not normalize to "5".
        if dropped.endswith(".") and core[:1].isdigit():
            core = "." + core
        return core

    norm_p, norm_g = norm(p), norm(g)
    if norm_p == norm_g:
        return 1.0

    expr_p, expr_g = latex_to_sympy(p), latex_to_sympy(g)
    parsed = {}

    def parse(src):
        # SECURITY NOTE(review): sp.sympify evaluates its string argument
        # with eval(), and `src` derives from model-generated text. Sandbox
        # or pre-validate it if the responses are not trusted.
        if src not in parsed:
            try:
                parsed[src] = sp.sympify(src)
            except Exception:
                # Unparseable input raises many different exception types;
                # treat every one of them as "no symbolic form".
                parsed[src] = None
        return parsed[src]

    def to_float(src):
        try:
            return float(src)
        except ValueError:
            pass
        expr = parse(src)
        if expr is None:
            return None
        try:
            return float(expr.evalf())
        except Exception:
            # Free symbols, complex values, non-expression objects, ...
            return None

    fp, fg = to_float(expr_p), to_float(expr_g)
    if fp is not None and fg is not None:
        if abs(fp - fg) / max(1.0, abs(fg)) < tol:
            return 1.0

    sym_p, sym_g = parse(expr_p), parse(expr_g)
    if sym_p is not None and sym_g is not None:
        try:
            if sp.simplify(sym_p - sym_g) == 0:
                return 1.0
        except Exception:
            # Best effort: operands that cannot be subtracted or simplified
            # just fall through to the partial-credit check.
            pass

    if norm_g and norm_g in norm_p:
        return 0.5
    return 0.0
print("\n=== Grader sanity checks ===")
# Each case: (model output, reference answer, reward we hope to see).
_SANITY_CASES = (
    ("The answer is \\boxed{120}", "[120]", 1.0),
    ("After computing: 7396 \\pi", "7396\\pi", 1.0),
    ("Final answer: -71/4", "-\\frac{71}{4}", 1.0),
    ("Therefore the result is 0.0074", "0.0074", 1.0),
    ("Final answer: nucleus accumbens", "Nucleus accumbens", 1.0),
    ("I don't know", "12", 0.0),
)
for pred, gold, want in _SANITY_CASES:
    # Print the actual reward next to the expected one; nothing is asserted.
    print(f"  pred={pred[:38]!r:42s} gold={gold!r:22s} -> r={grade(pred, gold)}  (want {want})")
# Instruction header placed at the top of every prompt; it ends with the
# "Final answer: " cue the model is asked to finish on.
SYSTEM = (
    "You are a STEM expert solving multimodal reasoning problems. "
    "You will see a question and one or more figures. "
    "Reason step by step, then end with exactly one line:\n"
    "Final answer: "
)


def build_prompt(ex):
    """Assemble the text prompt for one example.

    Args:
        ex: Mapping with an ``"images"`` entry (only its length is used) and
            a ``"question"`` entry.

    Returns:
        ``SYSTEM``, one ``[Image k]`` placeholder line per image, the
        question and a step-by-step cue, separated by blank lines.
    """
    image_count = len(ex["images"])
    placeholders = [f"[Image {k}]" for k in range(1, image_count + 1)]
    sections = (
        SYSTEM,
        "\n".join(placeholders),
        f"Question:\n{ex['question']}",
        "Let's think step by step.",
    )
    return "\n\n".join(sections)
print("\n=== Example prompt (truncated) ===")
# Show only the first 600 characters of the prompt built for the first example.
_prompt_preview = build_prompt(ds[0])[:600]
print(_prompt_preview, "...\n")



Source link

  • Related Posts

    Together AI Open-Sources OSCAR: An Attention-Aware 2-Bit KV Cache Quantization System for Long-Context LLM Serving

    Long-context inference makes the KV cache one of the main costs of serving LLMs. During autoregressive decoding, the cache grows with context length, batch size, and model depth. At high…

    Step by Step Guide to Build and Compare FedAvg and FedProx Federated Learning on Non-IID CIFAR-10 with NVIDIA FLARE

    CLIENT_SCRIPT += r''' def main(): p = argparse.ArgumentParser() p.add_argument("--num_sites", type=int, default=3) p.add_argument("--alpha", type=float, default=0.3) p.add_argument("--local_epochs", type=int, default=1) p.add_argument("--mu", type=float, default=0.0) p.add_argument("--max_samples", type=int, default=4000) p.add_argument("--batch_size", type=int, default=64) p.add_argument("--lr", type=float, default=0.01) p.add_argument("--data_root",…

    Leave a Reply

    Your email address will not be published. Required fields are marked *