How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

# NOTE(review): this section uses `re`, `plt`, `patches`, `requests`, `Image`,
# `ImageDraw` and `BytesIO` without importing them; they are assumed to be
# imported by an earlier setup section of the tutorial -- confirm.

# One non-negative decimal number such as "0", "0.45", "1." or ".5".
# Stricter than "[\d.]+", which also accepted "." and "1.2.3" and then made
# float() raise ValueError.
_NUMBER = r"(\d+(?:\.\d*)?|\.\d+)"

# A single- or double-quoted argument; the group captures its contents.
_QUOTED = r"""["'](.+?)["']"""

_CLICK_PATTERN = rf"click\(\s*{_NUMBER}\s*,\s*{_NUMBER}\s*\)"

# (action type, result field name, regex) for every action whose single
# argument is copied verbatim into the result dictionary.
_STRING_ARG_ACTIONS = (
    ("goto", "url", rf"goto\(\s*{_QUOTED}\s*\)"),
    ("type", "text", rf"type\(\s*{_QUOTED}\s*\)"),
    ("scroll", "direction", r"""scroll\(\s*["']?(up|down)["']?\s*\)"""),
    ("press", "key", rf"press\(\s*{_QUOTED}\s*\)"),
    # (?s) lets "." span newlines so multi-line messages are captured whole.
    ("send_msg", "message", rf"(?s)send_msg\(\s*{_QUOTED}\s*\)"),
)


def parse_click_coords(action_str):
    """Extract the normalized (x, y) coordinates from a click action string.

    Example: 'click(0.45, 0.32)' -> (0.45, 0.32)

    The click may appear anywhere inside ``action_str``.

    Args:
        action_str: Text containing the predicted action.

    Returns:
        A ``(x, y)`` tuple of floats, or ``None`` if no well-formed click
        action is present.
    """
    match = re.search(_CLICK_PATTERN, action_str)
    if match:
        return float(match.group(1)), float(match.group(2))
    return None


def parse_action_details(action_str: str) -> dict:
    """Parse a MolmoWeb action string into a structured dictionary.

    Leading and trailing whitespace is ignored; the action must start at the
    beginning of the (stripped) string.

    Args:
        action_str: The action text, e.g. 'goto("https://arxiv.org")'.

    Returns:
        One of:
            {"type": "click", "x": 0.45, "y": 0.32}
            {"type": "goto", "url": "https://..."}
            {"type": "type", "text": "query text"}
            {"type": "scroll", "direction": "down"}
            {"type": "press", "key": "Enter"}
            {"type": "send_msg", "message": "The answer is..."}
            {"type": "new_tab" | "go_back" | "switch_tab"} with an integer
            "tab" entry when a tab index is given
            {"type": "unknown", "raw": "..."} when nothing matches
    """
    action_str = action_str.strip()

    click = re.match(_CLICK_PATTERN, action_str)
    if click:
        return {
            "type": "click",
            "x": float(click.group(1)),
            "y": float(click.group(2)),
        }

    for action_type, field, pattern in _STRING_ARG_ACTIONS:
        found = re.match(pattern, action_str)
        if found:
            return {"type": action_type, field: found.group(1)}

    nav = re.match(r"(new_tab|go_back|switch_tab)\(\s*(\d*)\s*\)", action_str)
    if nav:
        result = {"type": nav.group(1)}
        if nav.group(2):  # the tab index is optional
            result["tab"] = int(nav.group(2))
        return result

    return {"type": "unknown", "raw": action_str}


def visualise_click(image, action_str, title="MolmoWeb Prediction"):
    """Display the screenshot with the predicted click location drawn on it.

    Click coordinates are normalized (0-1) and are converted to pixel space
    using ``image.size``. When ``action_str`` holds no click, the raw action
    text is shown in a banner near the bottom of the figure instead.

    Args:
        image: Screenshot to display; must expose ``.size`` as (width, height)
            and be accepted by ``Axes.imshow``.
        action_str: The predicted action text.
        title: Figure title.
    """
    coords = parse_click_coords(action_str)

    _, ax = plt.subplots(1, 1, figsize=(12, 7))
    ax.imshow(image)
    ax.set_title(title, fontsize=14)

    if coords:
        x_norm, y_norm = coords
        w, h = image.size
        x_px, y_px = x_norm * w, y_norm * h

        circle = patches.Circle(
            (x_px, y_px),
            radius=18,
            linewidth=3,
            edgecolor="red",
            facecolor="none",
        )
        ax.add_patch(circle)
        ax.plot(x_px, y_px, "r+", markersize=20, markeredgewidth=3)
        ax.annotate(
            f"click({x_norm:.3f}, {y_norm:.3f})",
            (x_px, y_px),
            xytext=(x_px + 25, y_px - 25),
            fontsize=11,
            color="white",
            bbox=dict(boxstyle="round,pad=0.3", facecolor="red", alpha=0.8),
            arrowprops=dict(arrowstyle="->", color="red", lw=2),
        )
    else:
        ax.text(
            0.5,
            0.02,
            f"Action: {action_str}",
            transform=ax.transAxes,
            fontsize=12,
            ha="center",
            color="white",
            bbox=dict(boxstyle="round,pad=0.4", facecolor="blue", alpha=0.8),
        )

    ax.axis("off")
    plt.tight_layout()
    plt.show()


# Backward-compatible alias for the capitalised spelling used previously.
Visualise_click = visualise_click


def download_image(url, size=(1280, 720)):
    """Download an image from a URL and resize it to the browser viewport.

    Args:
        url: Address of the image to fetch.
        size: Target (width, height) in pixels.

    Returns:
        The downloaded image, converted to RGB and resized to ``size``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, timeout=15)
    # Fail with a clear HTTP error instead of letting PIL choke on an
    # error page body.
    response.raise_for_status()
    img = Image.open(BytesIO(response.content)).convert("RGB")
    return img.resize(size, Image.LANCZOS)


def create_synthetic_webpage(title="Sample Page", elements=None):
    """Create a synthetic 1280x720 web page screenshot for testing.

    The image has a browser-style top bar (three window dots and an address
    field) followed by the page title and the requested elements.

    Args:
        title: Heading text drawn below the browser bar.
        elements: Optional list of dictionaries, each with:
            "type": "button", "input", "text" or "link"
            "text": the label to draw
            "pos":  (x, y) top-left pixel position
            Elements with any other "type" are skipped.

    Returns:
        The rendered RGB image.
    """
    img = Image.new("RGB", (1280, 720), color=(255, 255, 255))
    draw = ImageDraw.Draw(img)

    # Browser chrome: toolbar background, address field and window dots.
    draw.rectangle([0, 0, 1280, 50], fill=(240, 240, 240))
    draw.rectangle(
        [180, 10, 900, 40], outline=(200, 200, 200), width=1, fill="white"
    )
    draw.text((200, 16), "https://www.example.com", fill=(100, 100, 100))
    for cx in [30, 60, 90]:
        draw.ellipse([cx - 8, 17, cx + 8, 33], fill=(200, 200, 200))

    draw.text((50, 70), title, fill="black")

    if elements:
        for el in elements:
            x, y = el["pos"]
            if el["type"] == "button":
                draw.rectangle([x, y, x + 150, y + 35], fill=(66, 133, 244))
                draw.text((x + 10, y + 8), el["text"], fill="white")
            elif el["type"] == "input":
                draw.rectangle(
                    [x, y, x + 300, y + 35], outline=(180, 180, 180), width=2
                )
                draw.text((x + 10, y + 8), el["text"], fill=(150, 150, 150))
            elif el["type"] == "text":
                draw.text((x, y), el["text"], fill="black")
            elif el["type"] == "link":
                draw.text((x, y), el["text"], fill=(66, 133, 244))

    return img


print("Helper functions defined successfully.")

# Section 5 demo. Guarded so that importing this module only defines the
# helpers; running it as a script or notebook cell still executes the demo
# and leaves its variables as module globals.
# NOTE(review): build_prompt, run_inference and parse_thought_and_action are
# not defined in this section -- assumed to come from earlier sections.
if __name__ == "__main__":
    print("\n" + "=" * 70)
    print("Section 5: Single-Step Inference - Blank Page (Cold Start)")
    print("=" * 70)
    print("The agent starts at about:blank and must decide on its first action.\n")

    blank_image = Image.new("RGB", (1280, 720), color="white")
    task = "Go to arxiv.org to find Ai2's latest paper on Molmo."

    prompt = build_prompt(
        task_description=task,
        page_url="about:blank",
        page_index=0,
    )

    print(f"Task: {task}")
    print("Screenshot: Blank white image (about:blank)")
    print("Running inference...\n")

    raw_output = run_inference(prompt, blank_image)
    print(f"Raw model output:\n{raw_output}\n")

    parsed = parse_thought_and_action(raw_output)
    print(f"Thought: {parsed['thought']}")
    print(f"Action:  {parsed['action']}")

    action_details = parse_action_details(parsed["action"])
    print(f"Parsed:  {action_details}")

How to Build a Vision-Guided Web AI Agent with MolmoWeb-4B Using Multimodal Reasoning and Action Prediction

Leave a Reply Cancel reply

Follow US

Popular News

Ramonainv (ramonainv.com) program details. Reviews, Scam or Paying

NVIDIA Researchers Propose Reinforcement Learning Pretraining (RLP): Reinforcement as a Pretraining Objective for Building Reasoning During Pretraining

10 Essential Alfred Hitchcock Movies Everyone Should See

The Matrix 5 Update: Script Progress and Drew Goddard’s Vision

Steph Curry Celebrates Ayesha Curry’s 37th Birthday, Fans React

Categories

About US

Quick Links

Important Links

Subscribe US