| """ |
| Chain-of-Sight (CoS) Integration Example |
| |
| Demonstrates visual grounding with region tokens and bounding boxes. |
| Inspired by: https://github.com/baaivision/CoS |
| |
| This shows how to use region tokens for spatially-aware reasoning. |
| """ |
|
|
| import sys |
| from pathlib import Path |
| from dataclasses import dataclass, field |
| from typing import List, Dict, Any, Optional |
| from enum import Enum |
|
|
| |
# Put the parent of this file's directory first on sys.path so that project
# modules are importable when this file is run directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Load src/reasoning/step_data.py straight from its file path rather than
# importing it through its package.
import importlib.util

spec = importlib.util.spec_from_file_location(
    "step_data",
    Path(__file__).parent.parent / "src" / "reasoning" / "step_data.py",
)
# spec_from_file_location() is documented as possibly returning None; fail
# with a clear ImportError instead of an AttributeError further down.
if spec is None or spec.loader is None:
    raise ImportError(
        "Cannot build an import spec for src/reasoning/step_data.py"
    )
step_data = importlib.util.module_from_spec(spec)

# Seed the module namespace with torch *before* the module body executes.
# NOTE(review): presumably step_data expects a global `torch` to exist at
# import time -- confirm against src/reasoning/step_data.py.
import torch

step_data.torch = torch

# Register the module before executing it, following the importlib recipe for
# importing a source file directly. Code that looks a class's module up by
# name in sys.modules (pickle, dataclasses resolving string annotations) can
# fail if the module is missing from that table.
sys.modules[spec.name] = step_data
spec.loader.exec_module(step_data)

# Re-export the names used by the examples below.
ReasoningStep = step_data.ReasoningStep
ReasoningChain = step_data.ReasoningChain
StepType = step_data.StepType
SPECIAL_TOKENS = step_data.SPECIAL_TOKENS
REGION_TOKENS = step_data.REGION_TOKENS
|
|
|
|
def example_visual_grounding():
    """Build and print a region-grounded counting chain (Example 1).

    Question: "How many red apples are on the left side of the basket?"

    Four steps (perception, localization, perception, counting) are created,
    wrapped in a ``ReasoningChain`` and printed one by one with their
    confidence, region tokens, bounding boxes and dependencies. The chain is
    then printed again through ``chain.format_with_tokens()``.

    Returns:
        The ``ReasoningChain`` built for this example.
    """
    print("=" * 70)
    print("EXAMPLE 1: Visual Grounding with Region Tokens (Chain-of-Sight)")
    print("=" * 70)

    # Step 1: perceive the basket (region 1).
    step1 = ReasoningStep(
        step_id=0,
        step_type=StepType.PERCEPTION,
        description="I observe a woven basket in the center of the image",
        confidence=0.95,
        region_ids=[1],
        bounding_boxes=[[0.3, 0.2, 0.7, 0.8]],
    )

    # Step 2: narrow attention to the left side of the basket (region 2).
    step2 = ReasoningStep(
        step_id=1,
        step_type=StepType.LOCALIZATION,
        description="I focus on the left side of the basket",
        confidence=0.92,
        dependencies=[0],
        region_ids=[2],
        bounding_boxes=[[0.3, 0.2, 0.5, 0.8]],
    )

    # Step 3: identify the individual red apples (regions 3-5, one box each).
    step3 = ReasoningStep(
        step_id=2,
        step_type=StepType.PERCEPTION,
        description="I identify red-colored apples in this region",
        confidence=0.90,
        dependencies=[1],
        region_ids=[3, 4, 5],
        bounding_boxes=[
            [0.32, 0.25, 0.42, 0.35],
            [0.35, 0.40, 0.45, 0.50],
            [0.38, 0.60, 0.48, 0.70],
        ],
    )

    # Step 4: count the apples found in step 3 (re-uses regions 3-5, no boxes).
    step4 = ReasoningStep(
        step_id=3,
        step_type=StepType.COUNTING,
        description="Counting red apples in regions 3, 4, 5: Total of 3 apples",
        confidence=0.88,
        dependencies=[2],
        region_ids=[3, 4, 5],
    )

    chain = ReasoningChain(
        chain_id="cos_example_001",
        image_path="data/basket_apples.jpg",
        prompt="How many red apples are on the left side of the basket?",
        steps=[step1, step2, step3, step4],
        final_answer="There are 3 red apples on the left side of the basket",
        is_correct=True,
    )

    # NOTE(review): the decorative glyphs in this file were mis-decoded; the
    # two emoji below are best-effort restorations -- confirm against upstream.
    print("\n📝 Question:", chain.prompt)
    print("\n🔍 Reasoning Steps with Visual Grounding:")
    print("-" * 70)

    # Headings are numbered from 1, but dependencies are stored as step_id
    # values (0-based here). Translate them so "Depends on" refers to the same
    # numbers the reader sees in the headings.
    display_number = {
        step.step_id: number for number, step in enumerate(chain.steps, 1)
    }

    for number, step in enumerate(chain.steps, 1):
        print(f"\nStep {number} ({step.step_type.value}):")
        print(f" Description: {step.description}")
        print(f" Confidence: {step.confidence:.2f}")

        if step.region_ids:
            region_tokens = ", ".join(f"<region{r}>" for r in step.region_ids)
            print(f" Regions: {region_tokens}")

        if step.bounding_boxes:
            print(f" Bounding Boxes: {len(step.bounding_boxes)} box(es)")
            for box_number, bbox in enumerate(step.bounding_boxes, 1):
                coords = ", ".join(f"{value:.3f}" for value in bbox)
                print(f" Box {box_number}: [{coords}]")

        if step.dependencies:
            # Fall back to the raw id if a dependency is not part of the chain.
            shown = ", ".join(
                str(display_number.get(dep, dep)) for dep in step.dependencies
            )
            print(f" Depends on: Step(s) {shown}")

    print(f"\n✅ Final Answer: {chain.final_answer}")

    # Show the same chain rendered with the special region/bbox tokens.
    print("\n" + "=" * 70)
    print("FORMATTED OUTPUT (with special tokens):")
    print("=" * 70)
    formatted = chain.format_with_tokens()
    print(formatted)
    print()

    return chain
|
|
|
|
def example_spatial_comparison():
    """Build and print a two-region size-comparison chain (Example 2).

    Question: "Is the red apple bigger than the green apple?"

    Two localization steps each establish one region with a bounding box, a
    comparison step references both regions, and an inference step draws the
    conclusion. Each step is printed with the region tokens it references,
    followed by the final answer and ``chain.format_with_tokens()``.

    Returns:
        The ``ReasoningChain`` built for this example.
    """
    print("\n" + "=" * 70)
    print("EXAMPLE 2: Spatial Comparison with Regions")
    print("=" * 70)

    # Step 1: locate the red apple (region 1).
    step1 = ReasoningStep(
        step_id=0,
        step_type=StepType.LOCALIZATION,
        description="I locate the red apple in the upper left region",
        confidence=0.93,
        region_ids=[1],
        bounding_boxes=[[0.1, 0.1, 0.3, 0.3]],
    )

    # Step 2: locate the green apple (region 2).
    step2 = ReasoningStep(
        step_id=1,
        step_type=StepType.LOCALIZATION,
        description="I locate the green apple in the lower right region",
        confidence=0.91,
        region_ids=[2],
        bounding_boxes=[[0.6, 0.6, 0.8, 0.8]],
    )

    # Step 3: compare the two regions established above.
    step3 = ReasoningStep(
        step_id=2,
        step_type=StepType.COMPARISON,
        description="Comparing bounding box sizes: red apple area ≈ 0.04, green apple area ≈ 0.04, approximately equal",
        confidence=0.87,
        dependencies=[0, 1],
        region_ids=[1, 2],
    )

    # Step 4: conclude from the comparison (no regions of its own).
    step4 = ReasoningStep(
        step_id=3,
        step_type=StepType.INFERENCE,
        description="Based on similar bounding box areas, the apples are approximately the same size",
        confidence=0.85,
        dependencies=[2],
    )

    chain = ReasoningChain(
        chain_id="cos_example_002",
        image_path="data/two_apples.jpg",
        prompt="Is the red apple bigger than the green apple?",
        steps=[step1, step2, step3, step4],
        final_answer="No, the red apple and green apple are approximately the same size",
        is_correct=True,
    )

    # NOTE(review): the decorative glyphs in this file were mis-decoded; the
    # two emoji below are best-effort restorations -- confirm against upstream.
    print("\n📝 Question:", chain.prompt)
    print("\n🔍 Reasoning with Region Comparison:")
    print("-" * 70)

    for number, step in enumerate(chain.steps, 1):
        print(f"\nStep {number} ({step.step_type.value}):")
        print(f" {step.description}")
        if step.region_ids:
            region_tokens = ", ".join(f"<region{r}>" for r in step.region_ids)
            print(f" → References: {region_tokens}")

    print(f"\n✅ Answer: {chain.final_answer}")
    print("\nFormatted:")
    print(chain.format_with_tokens())
    print()

    return chain
|
|
|
|
def example_multi_region_composition():
    """Build and print a multi-region compositional chain (Example 3).

    Question: "Describe the arrangement of fruits in the bowl"

    One perception step establishes the bowl, three localization steps each
    depend on it and add their own regions, and a final composition step
    depends on all three and references every fruit region. Each step is
    printed under its ``step_id`` with a count of its regions and boxes.

    Returns:
        The ``ReasoningChain`` built for this example.
    """
    print("\n" + "=" * 70)
    print("EXAMPLE 3: Multi-Region Compositional Reasoning")
    print("=" * 70)

    steps = [
        # The bowl as a whole (region 1).
        ReasoningStep(
            step_id=0,
            step_type=StepType.PERCEPTION,
            description="I observe a ceramic bowl containing multiple types of fruit",
            confidence=0.96,
            region_ids=[1],
            bounding_boxes=[[0.2, 0.2, 0.8, 0.8]],
        ),
        # Top layer: oranges (regions 2-4).
        ReasoningStep(
            step_id=1,
            step_type=StepType.LOCALIZATION,
            description="Top layer: three oranges arranged in a triangle",
            confidence=0.92,
            dependencies=[0],
            region_ids=[2, 3, 4],
            bounding_boxes=[
                [0.35, 0.25, 0.45, 0.35],
                [0.50, 0.25, 0.60, 0.35],
                [0.42, 0.35, 0.52, 0.45],
            ],
        ),
        # Middle layer: apples (regions 5-6).
        ReasoningStep(
            step_id=2,
            step_type=StepType.LOCALIZATION,
            description="Middle layer: two apples positioned side by side",
            confidence=0.90,
            dependencies=[0],
            region_ids=[5, 6],
            bounding_boxes=[
                [0.30, 0.50, 0.40, 0.60],
                [0.55, 0.50, 0.65, 0.60],
            ],
        ),
        # Bottom layer: banana (region 7).
        ReasoningStep(
            step_id=3,
            step_type=StepType.LOCALIZATION,
            description="Bottom layer: one banana resting at the base",
            confidence=0.88,
            dependencies=[0],
            region_ids=[7],
            bounding_boxes=[[0.35, 0.65, 0.60, 0.75]],
        ),
        # Composition over all fruit regions; depends on the three layers.
        ReasoningStep(
            step_id=4,
            step_type=StepType.COMPOSITION,
            description="The fruits are arranged in three distinct layers: oranges on top, apples in middle, banana at bottom",
            confidence=0.85,
            dependencies=[1, 2, 3],
            region_ids=[2, 3, 4, 5, 6, 7],
        ),
    ]

    chain = ReasoningChain(
        chain_id="cos_example_003",
        image_path="data/fruit_bowl.jpg",
        prompt="Describe the arrangement of fruits in the bowl",
        steps=steps,
        final_answer="The bowl contains fruits arranged in three layers: three oranges on top, two apples in the middle, and one banana at the bottom",
        is_correct=True,
    )

    # NOTE(review): the decorative glyphs in this file were mis-decoded; the
    # two emoji below are best-effort restorations -- confirm against upstream.
    print("\n📝 Question:", chain.prompt)
    print("\n🔍 Multi-Region Compositional Analysis:")
    print("-" * 70)

    for step in chain.steps:
        # Headings use the stored step_id here, not a 1-based counter.
        print(f"\nStep {step.step_id} ({step.step_type.value}):")
        print(f" {step.description}")
        if step.region_ids:
            print(f" → Regions: {len(step.region_ids)} region(s) - {step.region_ids}")
        if step.bounding_boxes:
            print(f" → Bboxes: {len(step.bounding_boxes)} box(es)")

    print(f"\n✅ Answer: {chain.final_answer}")
    print()

    return chain
|
|
|
|
def demonstrate_region_token_benefits():
    """Print the benefits of Chain-of-Sight region tokens.

    Writes two banner-delimited sections to stdout: a list of
    ``(title, description)`` benefits, then a checklist of integration points.
    Takes no arguments and returns ``None``.
    """
    print("\n" + "=" * 70)
    print("BENEFITS OF CHAIN-OF-SIGHT REGION TOKENS")
    print("=" * 70)

    # NOTE(review): the decorative glyphs in this file were mis-decoded; some
    # emoji below are best-effort restorations -- confirm against upstream.
    benefits = [
        ("🎯 Spatial Grounding", "Explicit reference to image regions with <region1>, <region2>, etc."),
        ("📍 Precise Localization", "Bounding boxes provide exact spatial coordinates"),
        ("🔗 Region Relationships", "Steps can reference and relate multiple regions"),
        ("🧩 Compositional Reasoning", "Build understanding by composing information from regions"),
        ("✅ Verifiable", "Can validate reasoning by checking bbox alignment with claims"),
        ("🔄 Reusable", "Regions established early can be referenced in later steps"),
        ("🎨 Visual Attention", "Makes model's visual focus explicit and interpretable"),
    ]

    for title, desc in benefits:
        print(f"\n{title}")
        print(f" → {desc}")

    print("\n" + "=" * 70)
    print("INTEGRATION WITH YOUR FRAMEWORK")
    print("=" * 70)

    integration = [
        "✅ Region tokens (<region1>-<region10>) added to SPECIAL_TOKENS",
        "✅ Bounding box support in ReasoningStep",
        "✅ Automatic formatting with <|region|> and <|bbox|> markers",
        "✅ Compatible with PRM evaluation (regions provide grounding)",
        "✅ Works with RL training (region-aware rewards)",
        "✅ Enhances inference-time scaling (verify region consistency)",
    ]

    for item in integration:
        print(f" {item}")

    print()
|
|
|
|
def show_token_list():
    """Display all available tokens, including the region tokens.

    Prints the number of entries in ``SPECIAL_TOKENS``, then every entry with
    a 1-based index, then each entry of ``REGION_TOKENS`` as a bullet.
    Returns ``None``.
    """
    print("\n" + "=" * 70)
    print("COMPLETE TOKEN LIST (with Chain-of-Sight additions)")
    print("=" * 70)

    # NOTE(review): the decorative glyphs in this file were mis-decoded; the
    # emoji below are best-effort restorations -- confirm against upstream.
    print(f"\n📊 Total tokens: {len(SPECIAL_TOKENS)}")
    print("\n🔤 All special tokens:")
    for index, token in enumerate(SPECIAL_TOKENS, 1):
        print(f" {index:2d}. {token}")

    print("\n🎨 Region tokens specifically:")
    for token in REGION_TOKENS:
        print(f" • {token}")

    print()
|
|
|
|
def main():
    """Run all Chain-of-Sight examples.

    Prints the title banner and the token list, runs the three example
    builders in order, prints the benefits overview, and finishes with a
    summary that lists the ``chain_id`` of every chain that was built.
    """
    print("\n" + "=" * 70)
    print("CHAIN-OF-SIGHT INTEGRATION WITH STEP-LEVEL COT")
    print("Inspired by: https://github.com/baaivision/CoS")
    print("=" * 70)

    # Show the token vocabulary first.
    show_token_list()

    # Run the examples in order and keep the chains so the summary is derived
    # from what was actually built rather than from a hard-coded count.
    chains = [
        example_visual_grounding(),
        example_spatial_comparison(),
        example_multi_region_composition(),
    ]

    demonstrate_region_token_benefits()

    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"✅ Generated {len(chains)} example reasoning chains")
    print("✅ Demonstrated visual grounding with region tokens")
    print("✅ Showed bounding box integration")
    print(f"✅ Examples saved to chains: {[chain.chain_id for chain in chains]}")
    print("\n💡 Use these patterns for training vision-language models with spatial reasoning!")
    print()


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|