A Coding Implementation of MolmoAct for Depth-Aware Spatial Reasoning, Visual Trajectory Tracing, and Robotic Action Prediction


class MolmoActVisualizer:
    """Visualization utilities for MolmoAct outputs.

    Offers two plots: a 2-D point trace drawn on top of an image
    (``plot_trace``) and a bar chart of an action vector (``plot_action``).
    """

    # Default per-dimension labels and bar colors used by ``plot_action``.
    # The first three entries are drawn as "Position", the next three as
    # "Rotation" and the seventh as "Gripper" (see the legend built there).
    _DEFAULT_ACTION_LABELS = (
        'Δx (forward)', 'Δy (left)', 'Δz (up)',
        'Rx (roll)', 'Ry (pitch)', 'Rz (yaw)',
        'Gripper',
    )
    _POSITION_COLOR = '#3498db'
    _ROTATION_COLOR = '#e74c3c'
    _GRIPPER_COLOR = '#2ecc71'
    # Neutral color for dimensions beyond the seven known ones.
    _EXTRA_COLOR = '#95a5a6'
    _DEFAULT_ACTION_COLORS = (
        (_POSITION_COLOR,) * 3 + (_ROTATION_COLOR,) * 3 + (_GRIPPER_COLOR,)
    )

    def __init__(self, figsize: Tuple[int, int] = (12, 8)):
        """Store the figure size and build the waypoint color ramp.

        Args:
            figsize: ``(width, height)`` passed to ``plt.subplots`` by
                ``plot_trace``.
        """
        self.figsize = figsize
        # Ten evenly spaced viridis colors; waypoints are mapped onto this
        # ramp from first to last.
        self.colors = plt.cm.viridis(np.linspace(0, 1, 10))

    @staticmethod
    def _trace_to_pixels(
        trace,
        width: int,
        height: int,
        coord_range: float = 256,
    ) -> np.ndarray:
        """Convert trace points to pixel coordinates.

        Args:
            trace: Sequence of ``[x, y, ...]`` points (list or array), a
                single flat ``[x, y]`` point, an empty sequence, or ``None``.
                Only the first two columns are used.
            width: Image width in pixels.
            height: Image height in pixels.
            coord_range: Size of the grid the trace coordinates are expressed
                on; ``x`` is scaled by ``width / coord_range`` and ``y`` by
                ``height / coord_range``.

        Returns:
            A float array of shape ``(N, 2)`` holding ``(x, y)`` pixel
            positions; shape ``(0, 2)`` when there is nothing to draw.

        Raises:
            ValueError: If ``coord_range`` is not positive or the trace is
                not a sequence of points with at least two coordinates.
        """
        if coord_range <= 0:
            raise ValueError(f"coord_range must be positive, got {coord_range}")
        if trace is None:
            return np.empty((0, 2), dtype=float)

        points = np.asarray(trace, dtype=float)
        if points.size == 0:
            return np.empty((0, 2), dtype=float)

        # Promote a single flat [x, y] point to a one-row trace.
        points = np.atleast_2d(points)
        if points.ndim != 2 or points.shape[1] < 2:
            raise ValueError(
                "trace must be a sequence of [x, y] points, "
                f"got array of shape {points.shape}"
            )

        return np.column_stack((
            points[:, 0] * width / coord_range,
            points[:, 1] * height / coord_range,
        ))

    @classmethod
    def _action_labels(
        cls,
        n_dims: int,
        action_labels: Optional[List[str]] = None,
    ) -> List[str]:
        """Return exactly ``n_dims`` tick labels for an action vector.

        When ``action_labels`` is ``None`` the built-in labels are truncated
        or padded with ``"Dim <k>"`` to fit. Caller-supplied labels must
        already match ``n_dims``.

        Raises:
            ValueError: If caller-supplied labels do not match ``n_dims``.
        """
        if action_labels is None:
            defaults = list(cls._DEFAULT_ACTION_LABELS[:n_dims])
            extras = [f'Dim {k + 1}' for k in range(len(defaults), n_dims)]
            return defaults + extras

        labels = list(action_labels)
        if len(labels) != n_dims:
            raise ValueError(
                f"action_labels has {len(labels)} entries but action has "
                f"{n_dims} values"
            )
        return labels

    @classmethod
    def _action_colors(cls, n_dims: int) -> List[str]:
        """Return one bar color per action dimension.

        The first seven dimensions use the position/rotation/gripper colors;
        any further dimension gets a neutral gray.
        """
        known = list(cls._DEFAULT_ACTION_COLORS[:n_dims])
        return known + [cls._EXTRA_COLOR] * (n_dims - len(known))

    def plot_trace(
        self,
        image: Image.Image,
        trace: List[List[int]],
        title: str = "Visual Reasoning Trace",
        save_path: Optional[str] = None,
        *,
        coord_range: float = 256,
    ) -> None:
        """Plot a visual trace overlaid on an image.

        Waypoints are connected by a line, numbered from 1, colored along
        the viridis ramp, and the first/last points are highlighted as
        "Start"/"End". With an empty or ``None`` trace only the image and
        title are shown.

        Args:
            image: Image to draw on; converted with ``np.array``.
            trace: Points as ``[x, y]`` pairs on a ``coord_range`` grid.
            title: Axes title.
            save_path: If truthy, the figure is written there before showing.
            coord_range: Grid size the trace coordinates are expressed on
                (256 by default); used to rescale to image pixels.

        Raises:
            ValueError: If ``trace`` is not a sequence of 2-D points or
                ``coord_range`` is not positive.
        """
        fig, ax = plt.subplots(figsize=self.figsize)

        img_array = np.array(image)
        ax.imshow(img_array)

        h, w = img_array.shape[:2]
        points = self._trace_to_pixels(trace, w, h, coord_range)

        if len(points) > 0:
            x_coords, y_coords = points[:, 0], points[:, 1]

            # Wide translucent white line under a thin cyan one so the path
            # stays visible on both dark and light image regions.
            ax.plot(x_coords, y_coords, 'w-', linewidth=2, alpha=0.7)
            ax.plot(x_coords, y_coords, 'c-', linewidth=1, alpha=0.9)

            # Spread waypoint colors over the whole ramp; the max() guards
            # the single-point case against division by zero.
            last_color = len(self.colors) - 1
            steps = max(len(points) - 1, 1)
            for i, (x, y) in enumerate(zip(x_coords, y_coords)):
                color_idx = int(i * last_color / steps)
                ax.scatter(x, y, c=[self.colors[color_idx]], s=100,
                           edgecolors="white", linewidths=2, zorder=5)
                ax.annotate(f'{i+1}', (x, y), textcoords="offset points",
                            xytext=(5, 5), fontsize=10, color="white",
                            fontweight="bold")

            ax.scatter(x_coords[0], y_coords[0], c="lime", s=200,
                       marker="o", edgecolors="white", linewidths=3,
                       zorder=6, label="Start")
            ax.scatter(x_coords[-1], y_coords[-1], c="red", s=200,
                       marker="X", edgecolors="white", linewidths=3,
                       zorder=6, label="End")

            # Only build a legend when labeled artists exist; calling it on
            # an empty axes makes matplotlib emit a warning.
            ax.legend(loc="upper right")

        ax.set_title(title, fontsize=14, fontweight="bold")
        ax.axis('off')

        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=150, bbox_inches="tight")
            print(f"💾 Saved visualization to {save_path}")

        plt.show()

    def plot_action(
        self,
        action: List[float],
        action_labels: Optional[List[str]] = None,
        title: str = "Predicted Robot Action",
        save_path: Optional[str] = None
    ) -> None:
        """Plot action values as a bar chart.

        Each value becomes one bar annotated with its value to three
        decimals. Bars are colored by group (position, rotation, gripper);
        dimensions beyond the seventh are drawn in gray.

        Args:
            action: Action values, one per dimension.
            action_labels: Tick labels, one per value. Defaults to the
                built-in 7-dimension labels, truncated or padded with
                ``"Dim <k>"`` to match ``len(action)``.
            title: Axes title.
            save_path: If truthy, the figure is written there before showing.

        Raises:
            ValueError: If ``action_labels`` is given and its length differs
                from ``len(action)``.
        """
        n_dims = len(action)
        labels = self._action_labels(n_dims, action_labels)
        colors = self._action_colors(n_dims)

        fig, ax = plt.subplots(figsize=(10, 5))

        x = np.arange(n_dims)
        bars = ax.bar(x, action, color=colors, edgecolor="white", linewidth=1.5)

        for bar, val in zip(bars, action):
            height = bar.get_height()
            # Put the value above positive bars and below negative ones.
            ax.annotate(f'{val:.3f}',
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3 if height >= 0 else -12),
                        textcoords="offset points",
                        ha="center", va="bottom" if height >= 0 else 'top',
                        fontsize=9, fontweight="bold")

        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_ylabel('Value', fontsize=12)
        ax.set_title(title, fontsize=14, fontweight="bold")
        ax.axhline(y=0, color="gray", linestyle="--", alpha=0.5)
        ax.grid(axis="y", alpha=0.3)

        from matplotlib.patches import Patch
        legend_elements = [
            Patch(facecolor=self._POSITION_COLOR, label="Position"),
            Patch(facecolor=self._ROTATION_COLOR, label="Rotation"),
            Patch(facecolor=self._GRIPPER_COLOR, label="Gripper")
        ]
        ax.legend(handles=legend_elements, loc="upper right")

        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=150, bbox_inches="tight")
            print(f"💾 Saved visualization to {save_path}")

        plt.show()



Source link

  • Related Posts

    MiniMax Just Open Sourced MiniMax M2.7: A Self-Evolving Agent Model that Scores 56.22% on SWE-Pro and 57.0% on Terminal Bench 2

    MiniMax has officially open-sourced MiniMax M2.7, making the model weights publicly available on Hugging Face. Originally announced on March 18, 2026, MiniMax M2.7 is MiniMax’s most capable open-source model…

    Liquid AI Releases LFM2.5-VL-450M: a 450M-Parameter Vision-Language Model with Bounding Box Prediction, Multilingual Support, and Sub-250ms Edge Inference

    Liquid AI just released LFM2.5-VL-450M, an updated version of its earlier LFM2-VL-450M vision-language model. The new release introduces bounding box prediction, improved instruction following, expanded multilingual understanding, and function calling…

    Leave a Reply

    Your email address will not be published. Required fields are marked *