# File size: 13,018 Bytes

# da76488

"""
Chain-of-Sight (CoS) Integration Example

Demonstrates visual grounding with region tokens and bounding boxes.
Inspired by: https://github.com/baaivision/CoS

This shows how to use region tokens for spatially-aware reasoning.
"""

import sys
from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from enum import Enum

# Direct imports to avoid accelerate dependency
sys.path.insert(0, str(Path(__file__).parent.parent))

# Import just what we need without loading the full framework:
# step_data.py is loaded straight from its file path rather than through
# its parent package.
import importlib.util
spec = importlib.util.spec_from_file_location(
    "step_data",
    Path(__file__).parent.parent / "src" / "reasoning" / "step_data.py"
)
# spec_from_file_location() can return None (and a spec can lack a loader);
# fail with a readable ImportError instead of an AttributeError on None.
if spec is None or spec.loader is None:
    raise ImportError(
        "Could not create an import spec for src/reasoning/step_data.py"
    )
step_data = importlib.util.module_from_spec(spec)

# Inject minimal dependencies
import torch
step_data.torch = torch

# Register the module before executing it, as in the importlib
# "importing a source file directly" recipe. Code that looks the module up
# through sys.modules by name (for example dataclasses resolving string
# annotations, or pickle) otherwise finds nothing.
sys.modules["step_data"] = step_data

# Load the module
spec.loader.exec_module(step_data)

# Import what we need
ReasoningStep = step_data.ReasoningStep
ReasoningChain = step_data.ReasoningChain
StepType = step_data.StepType
SPECIAL_TOKENS = step_data.SPECIAL_TOKENS
REGION_TOKENS = step_data.REGION_TOKENS


def example_visual_grounding():
    """
    Example: Visual grounding with region tokens.

    Question: "How many red apples are on the left side of the basket?"

    Builds a four-step chain (perception, localization, perception,
    counting), prints every step with its regions, bounding boxes and
    dependencies, then prints the result of ``chain.format_with_tokens()``.

    Steps are labelled with their ``step_id`` so the headings use the same
    numbering as the ids listed under "Depends on".

    Returns:
        The ReasoningChain built for this example.
    """
    print("=" * 70)
    print("EXAMPLE 1: Visual Grounding with Region Tokens (Chain-of-Sight)")
    print("=" * 70)

    # Step 1: Perceive and localize the basket
    step1 = ReasoningStep(
        step_id=0,
        step_type=StepType.PERCEPTION,
        description="I observe a woven basket in the center of the image",
        confidence=0.95,
        region_ids=[1],  # Refers to <region1>
        bounding_boxes=[[0.3, 0.2, 0.7, 0.8]]  # [x1, y1, x2, y2] normalized coords
    )

    # Step 2: Localize left side
    step2 = ReasoningStep(
        step_id=1,
        step_type=StepType.LOCALIZATION,
        description="I focus on the left side of the basket",
        confidence=0.92,
        dependencies=[0],
        region_ids=[2],  # Refers to <region2>
        bounding_boxes=[[0.3, 0.2, 0.5, 0.8]]  # Left half of basket
    )

    # Step 3: Identify red apples
    step3 = ReasoningStep(
        step_id=2,
        step_type=StepType.PERCEPTION,
        description="I identify red-colored apples in this region",
        confidence=0.90,
        dependencies=[1],
        region_ids=[3, 4, 5],  # Three apple regions
        bounding_boxes=[
            [0.32, 0.25, 0.42, 0.35],  # Apple 1
            [0.35, 0.40, 0.45, 0.50],  # Apple 2
            [0.38, 0.60, 0.48, 0.70],  # Apple 3
        ]
    )

    # Step 4: Count the apples (reuses the apple regions, adds no new boxes)
    step4 = ReasoningStep(
        step_id=3,
        step_type=StepType.COUNTING,
        description="Counting red apples in regions 3, 4, 5: Total of 3 apples",
        confidence=0.88,
        dependencies=[2],
        region_ids=[3, 4, 5]
    )

    # Create reasoning chain
    chain = ReasoningChain(
        chain_id="cos_example_001",
        image_path="data/basket_apples.jpg",
        prompt="How many red apples are on the left side of the basket?",
        steps=[step1, step2, step3, step4],
        final_answer="There are 3 red apples on the left side of the basket",
        is_correct=True
    )

    print("\n📝 Question:", chain.prompt)
    print("\n🔍 Reasoning Steps with Visual Grounding:")
    print("-" * 70)

    for step in chain.steps:
        # Label by step_id, not by loop position: the dependency list below
        # prints step_ids, and the two numberings must agree.
        print(f"\nStep {step.step_id} ({step.step_type.value}):")
        print(f"  Description: {step.description}")
        print(f"  Confidence: {step.confidence:.2f}")

        if step.region_ids:
            region_tags = ", ".join(f"<region{r}>" for r in step.region_ids)
            print(f"  Regions: {region_tags}")

        if step.bounding_boxes:
            print(f"  Bounding Boxes: {len(step.bounding_boxes)} box(es)")
            for j, bbox in enumerate(step.bounding_boxes, 1):
                print(f"    Box {j}: [{bbox[0]:.3f}, {bbox[1]:.3f}, {bbox[2]:.3f}, {bbox[3]:.3f}]")

        if step.dependencies:
            print(f"  Depends on: Step(s) {', '.join(map(str, step.dependencies))}")

    print(f"\n✅ Final Answer: {chain.final_answer}")

    # Show formatted output with special tokens
    print("\n" + "=" * 70)
    print("FORMATTED OUTPUT (with special tokens):")
    print("=" * 70)
    formatted = chain.format_with_tokens()
    print(formatted)
    print()

    return chain


def example_spatial_comparison():
    """
    Example: size comparison between two separately grounded objects.

    Question: "Is the red apple bigger than the green apple?"

    Two localization steps each attach one region and one bounding box, a
    comparison step references both regions, and an inference step draws
    the conclusion. Every step is printed with the regions it references,
    followed by the output of ``chain.format_with_tokens()``.

    Returns:
        The ReasoningChain built for this example.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("EXAMPLE 2: Spatial Comparison with Regions")
    print(rule)

    # Locate the red apple
    red_apple = ReasoningStep(
        step_id=0,
        step_type=StepType.LOCALIZATION,
        description="I locate the red apple in the upper left region",
        confidence=0.93,
        region_ids=[1],
        bounding_boxes=[[0.1, 0.1, 0.3, 0.3]],
    )

    # Locate the green apple
    green_apple = ReasoningStep(
        step_id=1,
        step_type=StepType.LOCALIZATION,
        description="I locate the green apple in the lower right region",
        confidence=0.91,
        region_ids=[2],
        bounding_boxes=[[0.6, 0.6, 0.8, 0.8]],
    )

    # Compare the two boxes (each is 0.2 x 0.2 in normalized coordinates)
    size_check = ReasoningStep(
        step_id=2,
        step_type=StepType.COMPARISON,
        description="Comparing bounding box sizes: red apple area ≈ 0.04, green apple area ≈ 0.04, approximately equal",
        confidence=0.87,
        dependencies=[0, 1],
        region_ids=[1, 2],
    )

    # Draw the conclusion from the comparison
    conclusion = ReasoningStep(
        step_id=3,
        step_type=StepType.INFERENCE,
        description="Based on similar bounding box areas, the apples are approximately the same size",
        confidence=0.85,
        dependencies=[2],
    )

    chain = ReasoningChain(
        chain_id="cos_example_002",
        image_path="data/two_apples.jpg",
        prompt="Is the red apple bigger than the green apple?",
        steps=[red_apple, green_apple, size_check, conclusion],
        final_answer="No, the red apple and green apple are approximately the same size",
        is_correct=True,
    )

    print("\n📝 Question:", chain.prompt)
    print("\n🔍 Reasoning with Region Comparison:")
    print("-" * 70)

    for number, current in enumerate(chain.steps, start=1):
        print(f"\nStep {number} ({current.step_type.value}):")
        print(f"  {current.description}")
        if not current.region_ids:
            continue
        references = ", ".join(f"<region{rid}>" for rid in current.region_ids)
        print(f"  → References: {references}")

    print(f"\n✅ Answer: {chain.final_answer}")
    print("\nFormatted:")
    print(chain.format_with_tokens())
    print()

    return chain


def example_multi_region_composition():
    """
    Example: composing one description out of several grounded regions.

    Question: "Describe the arrangement of fruits in the bowl"

    One perception step grounds the bowl, three localization steps ground
    the fruit layers (each depending on the bowl step), and a final
    composition step depends on all three layers. Each step is printed
    under its ``step_id`` with region and bounding-box counts.

    Returns:
        The ReasoningChain built for this example.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("EXAMPLE 3: Multi-Region Compositional Reasoning")
    print(rule)

    bowl = ReasoningStep(
        step_id=0,
        step_type=StepType.PERCEPTION,
        description="I observe a ceramic bowl containing multiple types of fruit",
        confidence=0.96,
        region_ids=[1],
        bounding_boxes=[[0.2, 0.2, 0.8, 0.8]],
    )

    oranges = ReasoningStep(
        step_id=1,
        step_type=StepType.LOCALIZATION,
        description="Top layer: three oranges arranged in a triangle",
        confidence=0.92,
        dependencies=[0],
        region_ids=[2, 3, 4],
        bounding_boxes=[
            [0.35, 0.25, 0.45, 0.35],  # Orange 1
            [0.50, 0.25, 0.60, 0.35],  # Orange 2
            [0.42, 0.35, 0.52, 0.45],  # Orange 3
        ],
    )

    apples = ReasoningStep(
        step_id=2,
        step_type=StepType.LOCALIZATION,
        description="Middle layer: two apples positioned side by side",
        confidence=0.90,
        dependencies=[0],
        region_ids=[5, 6],
        bounding_boxes=[
            [0.30, 0.50, 0.40, 0.60],  # Apple 1
            [0.55, 0.50, 0.65, 0.60],  # Apple 2
        ],
    )

    banana = ReasoningStep(
        step_id=3,
        step_type=StepType.LOCALIZATION,
        description="Bottom layer: one banana resting at the base",
        confidence=0.88,
        dependencies=[0],
        region_ids=[7],
        bounding_boxes=[[0.35, 0.65, 0.60, 0.75]],
    )

    arrangement = ReasoningStep(
        step_id=4,
        step_type=StepType.COMPOSITION,
        description="The fruits are arranged in three distinct layers: oranges on top, apples in middle, banana at bottom",
        confidence=0.85,
        dependencies=[1, 2, 3],
        region_ids=[2, 3, 4, 5, 6, 7],
    )

    chain = ReasoningChain(
        chain_id="cos_example_003",
        image_path="data/fruit_bowl.jpg",
        prompt="Describe the arrangement of fruits in the bowl",
        steps=[bowl, oranges, apples, banana, arrangement],
        final_answer="The bowl contains fruits arranged in three layers: three oranges on top, two apples in the middle, and one banana at the bottom",
        is_correct=True,
    )

    print("\n📝 Question:", chain.prompt)
    print("\n🔍 Multi-Region Compositional Analysis:")
    print("-" * 70)

    for item in chain.steps:
        print(f"\nStep {item.step_id} ({item.step_type.value}):")
        print(f"  {item.description}")
        regions = item.region_ids
        if regions:
            print(f"  → Regions: {len(regions)} region(s) - {regions}")
        boxes = item.bounding_boxes
        if boxes:
            print(f"  → Bboxes: {len(boxes)} box(es)")

    print(f"\n✅ Answer: {chain.final_answer}")
    print()

    return chain


def demonstrate_region_token_benefits():
    """Print the benefits of region tokens and a framework integration checklist.

    Writes two banner-delimited sections to stdout: a list of
    (title, explanation) benefit pairs, then a list of integration notes.
    Returns None.
    """
    rule = "=" * 70

    # Section 1: benefit title followed by an indented explanation.
    print("\n" + rule)
    print("BENEFITS OF CHAIN-OF-SIGHT REGION TOKENS")
    print(rule)

    benefit_pairs = (
        ("🎯 Spatial Grounding", "Explicit reference to image regions with <region1>, <region2>, etc."),
        ("📍 Precise Localization", "Bounding boxes provide exact spatial coordinates"),
        ("🔗 Region Relationships", "Steps can reference and relate multiple regions"),
        ("🧩 Compositional Reasoning", "Build understanding by composing information from regions"),
        ("✅ Verifiable", "Can validate reasoning by checking bbox alignment with claims"),
        ("🔄 Reusable", "Regions established early can be referenced in later steps"),
        ("🎨 Visual Attention", "Makes model's visual focus explicit and interpretable"),
    )
    for heading, explanation in benefit_pairs:
        print(f"\n{heading}")
        print(f"  → {explanation}")

    # Section 2: one indented line per integration note.
    print("\n" + rule)
    print("INTEGRATION WITH YOUR FRAMEWORK")
    print(rule)

    integration_notes = (
        "✅ Region tokens (<region1>-<region10>) added to SPECIAL_TOKENS",
        "✅ Bounding box support in ReasoningStep",
        "✅ Automatic formatting with <|region|> and <|bbox|> markers",
        "✅ Compatible with PRM evaluation (regions provide grounding)",
        "✅ Works with RL training (region-aware rewards)",
        "✅ Enhances inference-time scaling (verify region consistency)",
    )
    for note in integration_notes:
        print(f"  {note}")

    print()


def show_token_list():
    """Print every special token, numbered from 1, then the region tokens.

    Reads the module-level SPECIAL_TOKENS and REGION_TOKENS collections and
    writes to stdout only. Returns None.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("COMPLETE TOKEN LIST (with Chain-of-Sight additions)")
    print(rule)

    print(f"\n📋 Total tokens: {len(SPECIAL_TOKENS)}")

    # Numbered listing of the full token set.
    print("\n🔤 All special tokens:")
    for position, special in enumerate(SPECIAL_TOKENS, start=1):
        print(f"  {position:2d}. {special}")

    # Bulleted listing of the region tokens only.
    print("\n🎨 Region tokens specifically:")
    for region in REGION_TOKENS:
        print(f"  • {region}")

    print()


def main():
    """Run the token listing, the three examples and the benefits overview.

    Finishes by printing a summary that lists the chain ids returned by the
    three example functions. Returns None.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("CHAIN-OF-SIGHT INTEGRATION WITH STEP-LEVEL COT")
    print("Inspired by: https://github.com/baaivision/CoS")
    print(rule)

    # Token overview first, so the examples' region tokens have context.
    show_token_list()

    # Examples run in order; the list literal evaluates them left to right.
    chains = [
        example_visual_grounding(),
        example_spatial_comparison(),
        example_multi_region_composition(),
    ]

    demonstrate_region_token_benefits()

    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    print(f"✅ Generated {len(chains)} example reasoning chains")
    print("✅ Demonstrated visual grounding with region tokens")
    print("✅ Showed bounding box integration")
    chain_ids = [built.chain_id for built in chains]
    print(f"✅ Examples saved to chains: {chain_ids}")
    print("\n💡 Use these patterns for training vision-language models with spatial reasoning!")
    print()


# Run the demos only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()