dei-model / utils /chain_of_sight_example.py

renpas22

Add utils directory

da76488 3 months ago

13 kB

	"""
	Chain-of-Sight (CoS) Integration Example

	Demonstrates visual grounding with region tokens and bounding boxes.
	Inspired by: https://github.com/baaivision/CoS

	This shows how to use region tokens for spatially-aware reasoning.
	"""

	import sys
	from pathlib import Path
	from dataclasses import dataclass, field
	from typing import List, Dict, Any, Optional
	from enum import Enum

	# Direct imports to avoid accelerate dependency: put the repository root on
	# sys.path so sibling packages resolve without installing the project.
	sys.path.insert(0, str(Path(__file__).parent.parent))

	# Import just what we need without loading the full framework: load the
	# single file src/reasoning/step_data.py as a standalone module.
	import importlib.util

	_STEP_DATA_PATH = Path(__file__).parent.parent / "src" / "reasoning" / "step_data.py"
	spec = importlib.util.spec_from_file_location("step_data", _STEP_DATA_PATH)
	if spec is None or spec.loader is None:
	    # Fail with a clear message instead of an AttributeError on None below.
	    raise ImportError(f"Cannot create an import spec for {_STEP_DATA_PATH}")
	step_data = importlib.util.module_from_spec(spec)

	# Inject minimal dependencies
	import torch
	step_data.torch = torch

	# Register the module before executing it, as the importlib recipe for
	# importing a source file directly does. Code that looks its defining module
	# up by name in sys.modules (e.g. dataclasses resolving string annotations,
	# pickle) needs this entry to exist while the module body runs.
	sys.modules[spec.name] = step_data
	try:
	    spec.loader.exec_module(step_data)
	except Exception:
	    # Do not leave a half-initialised module behind for later imports.
	    sys.modules.pop(spec.name, None)
	    raise

	# Import what we need
	ReasoningStep = step_data.ReasoningStep
	ReasoningChain = step_data.ReasoningChain
	StepType = step_data.StepType
	SPECIAL_TOKENS = step_data.SPECIAL_TOKENS
	REGION_TOKENS = step_data.REGION_TOKENS


	def example_visual_grounding():
	    """
	    Walk through a counting question grounded in image regions.

	    Builds a four-step chain (perceive the basket, localize its left half,
	    identify the red apples, count them) for the question
	    "How many red apples are on the left side of the basket?", prints every
	    step with its region tokens and bounding boxes, prints the chain as
	    rendered by ``format_with_tokens()``, and returns the ``ReasoningChain``.
	    """
	    banner = "=" * 70
	    print(banner)
	    print("EXAMPLE 1: Visual Grounding with Region Tokens (Chain-of-Sight)")
	    print(banner)

	    # Boxes below are [x1, y1, x2, y2] in normalized coordinates.

	    # Perceive and localize the basket -> <region1>
	    perceive_basket = ReasoningStep(
	        step_id=0,
	        step_type=StepType.PERCEPTION,
	        description="I observe a woven basket in the center of the image",
	        confidence=0.95,
	        region_ids=[1],
	        bounding_boxes=[[0.3, 0.2, 0.7, 0.8]],
	    )

	    # Narrow attention to the left half of the basket -> <region2>
	    focus_left = ReasoningStep(
	        step_id=1,
	        step_type=StepType.LOCALIZATION,
	        description="I focus on the left side of the basket",
	        confidence=0.92,
	        dependencies=[0],
	        region_ids=[2],
	        bounding_boxes=[[0.3, 0.2, 0.5, 0.8]],
	    )

	    # One region and one box per red apple found in the left half
	    find_apples = ReasoningStep(
	        step_id=2,
	        step_type=StepType.PERCEPTION,
	        description="I identify red-colored apples in this region",
	        confidence=0.90,
	        dependencies=[1],
	        region_ids=[3, 4, 5],
	        bounding_boxes=[
	            [0.32, 0.25, 0.42, 0.35],  # Apple 1
	            [0.35, 0.40, 0.45, 0.50],  # Apple 2
	            [0.38, 0.60, 0.48, 0.70],  # Apple 3
	        ],
	    )

	    # Counting reuses the apple regions and adds no new boxes
	    count_apples = ReasoningStep(
	        step_id=3,
	        step_type=StepType.COUNTING,
	        description="Counting red apples in regions 3, 4, 5: Total of 3 apples",
	        confidence=0.88,
	        dependencies=[2],
	        region_ids=[3, 4, 5],
	    )

	    chain = ReasoningChain(
	        chain_id="cos_example_001",
	        image_path="data/basket_apples.jpg",
	        prompt="How many red apples are on the left side of the basket?",
	        steps=[perceive_basket, focus_left, find_apples, count_apples],
	        final_answer="There are 3 red apples on the left side of the basket",
	        is_correct=True,
	    )

	    print("\n📝 Question:", chain.prompt)
	    print("\n🔍 Reasoning Steps with Visual Grounding:")
	    print("-" * 70)

	    # NOTE(review): the step header is numbered from 1, while the
	    # "Depends on" line prints the raw 0-based ids stored in `dependencies`.
	    for position, current in enumerate(chain.steps, 1):
	        print("\nStep {} ({}):".format(position, current.step_type.value))
	        print(" Description: {}".format(current.description))
	        print(" Confidence: {:.2f}".format(current.confidence))

	        if current.region_ids:
	            region_tokens = ", ".join(f"<region{rid}>" for rid in current.region_ids)
	            print(" Regions: {}".format(region_tokens))

	        if current.bounding_boxes:
	            print(" Bounding Boxes: {} box(es)".format(len(current.bounding_boxes)))
	            for box_no, box in enumerate(current.bounding_boxes, 1):
	                print(" Box {}: [{:.3f}, {:.3f}, {:.3f}, {:.3f}]".format(box_no, *box))

	        if current.dependencies:
	            dependency_ids = ", ".join(str(dep) for dep in current.dependencies)
	            print(" Depends on: Step(s) {}".format(dependency_ids))

	    print("\n✅ Final Answer: {}".format(chain.final_answer))

	    # Show formatted output with special tokens
	    print("\n" + banner)
	    print("FORMATTED OUTPUT (with special tokens):")
	    print(banner)
	    print(chain.format_with_tokens())
	    print()

	    return chain


	def example_spatial_comparison():
	    """
	    Compare two objects that live in different image regions.

	    Builds a four-step chain (locate the red apple, locate the green apple,
	    compare their boxes, infer the answer) for the question
	    "Is the red apple bigger than the green apple?", prints each step with
	    the regions it references, prints the chain as rendered by
	    ``format_with_tokens()``, and returns the ``ReasoningChain``.
	    """
	    banner = "=" * 70
	    print("\n" + banner)
	    print("EXAMPLE 2: Spatial Comparison with Regions")
	    print(banner)

	    # Red apple -> <region1>, upper left
	    locate_red = ReasoningStep(
	        step_id=0,
	        step_type=StepType.LOCALIZATION,
	        description="I locate the red apple in the upper left region",
	        confidence=0.93,
	        region_ids=[1],
	        bounding_boxes=[[0.1, 0.1, 0.3, 0.3]],
	    )

	    # Green apple -> <region2>, lower right
	    locate_green = ReasoningStep(
	        step_id=1,
	        step_type=StepType.LOCALIZATION,
	        description="I locate the green apple in the lower right region",
	        confidence=0.91,
	        region_ids=[2],
	        bounding_boxes=[[0.6, 0.6, 0.8, 0.8]],
	    )

	    # Both boxes are 0.2 x 0.2, hence the 0.04 areas quoted in the text
	    compare_sizes = ReasoningStep(
	        step_id=2,
	        step_type=StepType.COMPARISON,
	        description="Comparing bounding box sizes: red apple area ≈ 0.04, green apple area ≈ 0.04, approximately equal",
	        confidence=0.87,
	        dependencies=[0, 1],
	        region_ids=[1, 2],
	    )

	    # The conclusion references no regions of its own
	    conclude = ReasoningStep(
	        step_id=3,
	        step_type=StepType.INFERENCE,
	        description="Based on similar bounding box areas, the apples are approximately the same size",
	        confidence=0.85,
	        dependencies=[2],
	    )

	    chain = ReasoningChain(
	        chain_id="cos_example_002",
	        image_path="data/two_apples.jpg",
	        prompt="Is the red apple bigger than the green apple?",
	        steps=[locate_red, locate_green, compare_sizes, conclude],
	        final_answer="No, the red apple and green apple are approximately the same size",
	        is_correct=True,
	    )

	    print("\n📝 Question:", chain.prompt)
	    print("\n🔍 Reasoning with Region Comparison:")
	    print("-" * 70)

	    for position, current in enumerate(chain.steps, 1):
	        print("\nStep {} ({}):".format(position, current.step_type.value))
	        print(" {}".format(current.description))
	        if current.region_ids:
	            region_tokens = ", ".join(f"<region{rid}>" for rid in current.region_ids)
	            print(" → References: {}".format(region_tokens))

	    print("\n✅ Answer: {}".format(chain.final_answer))
	    print("\nFormatted:")
	    print(chain.format_with_tokens())
	    print()

	    return chain


	def example_multi_region_composition():
	    """
	    Compose a scene description from several localized regions.

	    Builds a five-step chain for the prompt
	    "Describe the arrangement of fruits in the bowl": one perception step
	    for the bowl, three localization steps (oranges, apples, banana) that
	    each depend on it, and a composition step that depends on all three.
	    Prints a per-step summary and returns the ``ReasoningChain``.
	    """
	    banner = "=" * 70
	    print("\n" + banner)
	    print("EXAMPLE 3: Multi-Region Compositional Reasoning")
	    print(banner)

	    # The bowl itself -> <region1>
	    bowl = ReasoningStep(
	        step_id=0,
	        step_type=StepType.PERCEPTION,
	        description="I observe a ceramic bowl containing multiple types of fruit",
	        confidence=0.96,
	        region_ids=[1],
	        bounding_boxes=[[0.2, 0.2, 0.8, 0.8]],
	    )

	    oranges = ReasoningStep(
	        step_id=1,
	        step_type=StepType.LOCALIZATION,
	        description="Top layer: three oranges arranged in a triangle",
	        confidence=0.92,
	        dependencies=[0],
	        region_ids=[2, 3, 4],
	        bounding_boxes=[
	            [0.35, 0.25, 0.45, 0.35],  # Orange 1
	            [0.50, 0.25, 0.60, 0.35],  # Orange 2
	            [0.42, 0.35, 0.52, 0.45],  # Orange 3
	        ],
	    )

	    apples = ReasoningStep(
	        step_id=2,
	        step_type=StepType.LOCALIZATION,
	        description="Middle layer: two apples positioned side by side",
	        confidence=0.90,
	        dependencies=[0],
	        region_ids=[5, 6],
	        bounding_boxes=[
	            [0.30, 0.50, 0.40, 0.60],  # Apple 1
	            [0.55, 0.50, 0.65, 0.60],  # Apple 2
	        ],
	    )

	    banana = ReasoningStep(
	        step_id=3,
	        step_type=StepType.LOCALIZATION,
	        description="Bottom layer: one banana resting at the base",
	        confidence=0.88,
	        dependencies=[0],
	        region_ids=[7],
	        bounding_boxes=[[0.35, 0.65, 0.60, 0.75]],
	    )

	    # Combines the three layer steps; references every fruit region, no boxes
	    arrangement = ReasoningStep(
	        step_id=4,
	        step_type=StepType.COMPOSITION,
	        description="The fruits are arranged in three distinct layers: oranges on top, apples in middle, banana at bottom",
	        confidence=0.85,
	        dependencies=[1, 2, 3],
	        region_ids=[2, 3, 4, 5, 6, 7],
	    )

	    chain = ReasoningChain(
	        chain_id="cos_example_003",
	        image_path="data/fruit_bowl.jpg",
	        prompt="Describe the arrangement of fruits in the bowl",
	        steps=[bowl, oranges, apples, banana, arrangement],
	        final_answer="The bowl contains fruits arranged in three layers: three oranges on top, two apples in the middle, and one banana at the bottom",
	        is_correct=True,
	    )

	    print("\n📝 Question:", chain.prompt)
	    print("\n🔍 Multi-Region Compositional Analysis:")
	    print("-" * 70)

	    # Unlike the other examples, steps are labelled with their own step_id.
	    for current in chain.steps:
	        print("\nStep {} ({}):".format(current.step_id, current.step_type.value))
	        print(" {}".format(current.description))
	        if current.region_ids:
	            region_count = len(current.region_ids)
	            print(" → Regions: {} region(s) - {}".format(region_count, current.region_ids))
	        if current.bounding_boxes:
	            print(" → Bboxes: {} box(es)".format(len(current.bounding_boxes)))

	    print("\n✅ Answer: {}".format(chain.final_answer))
	    print()

	    return chain


	def demonstrate_region_token_benefits():
	    """
	    Print the benefits of the Chain-of-Sight approach and an integration list.

	    Writes two banner-delimited sections to stdout: a list of
	    (title, description) benefit pairs, then a checklist of integration
	    points. Takes no arguments and returns ``None``.
	    """
	    banner = "=" * 70

	    print("\n" + banner)
	    print("BENEFITS OF CHAIN-OF-SIGHT REGION TOKENS")
	    print(banner)

	    benefits = [
	        ("🎯 Spatial Grounding", "Explicit reference to image regions with <region1>, <region2>, etc."),
	        ("📍 Precise Localization", "Bounding boxes provide exact spatial coordinates"),
	        ("🔗 Region Relationships", "Steps can reference and relate multiple regions"),
	        ("🧩 Compositional Reasoning", "Build understanding by composing information from regions"),
	        ("✅ Verifiable", "Can validate reasoning by checking bbox alignment with claims"),
	        ("🔄 Reusable", "Regions established early can be referenced in later steps"),
	        ("🎨 Visual Attention", "Makes model's visual focus explicit and interpretable"),
	    ]

	    for title, desc in benefits:
	        print(f"\n{title}")
	        print(f" → {desc}")

	    print("\n" + banner)
	    print("INTEGRATION WITH YOUR FRAMEWORK")
	    print(banner)

	    integration = [
	        "✅ Region tokens (<region1>-<region10>) added to SPECIAL_TOKENS",
	        "✅ Bounding box support in ReasoningStep",
	        # The markers are written without backslashes: "\|" is not a valid
	        # string escape, so it triggered an invalid-escape warning and
	        # printed stray backslashes.
	        "✅ Automatic formatting with <|region|> and <|bbox|> markers",
	        "✅ Compatible with PRM evaluation (regions provide grounding)",
	        "✅ Works with RL training (region-aware rewards)",
	        "✅ Enhances inference-time scaling (verify region consistency)",
	    ]

	    for item in integration:
	        print(f" {item}")

	    print()


	def show_token_list():
	    """
	    Print every special token, then the region tokens on their own.

	    Reads the module-level ``SPECIAL_TOKENS`` and ``REGION_TOKENS``; writes
	    a numbered listing of the former and a bulleted listing of the latter
	    to stdout. Returns ``None``.
	    """
	    banner = "=" * 70
	    print("\n" + banner)
	    print("COMPLETE TOKEN LIST (with Chain-of-Sight additions)")
	    print(banner)

	    token_total = len(SPECIAL_TOKENS)
	    print("\n📋 Total tokens: {}".format(token_total))

	    print("\n🔤 All special tokens:")
	    # Numbering starts at 1 and is right-aligned to two digits.
	    for number, special in enumerate(SPECIAL_TOKENS, start=1):
	        print(" {:2d}. {}".format(number, special))

	    print("\n🎨 Region tokens specifically:")
	    for region_token in REGION_TOKENS:
	        print(" • {}".format(region_token))

	    print()


	def main():
	    """
	    Run all Chain-of-Sight examples and print a closing summary.

	    Order: token listing, the three example chains, the benefits overview,
	    then a summary whose chain count and chain ids are taken from the
	    chains the examples actually returned. Returns ``None``.
	    """
	    banner = "=" * 70

	    print("\n" + banner)
	    print("CHAIN-OF-SIGHT INTEGRATION WITH STEP-LEVEL COT")
	    print("Inspired by: https://github.com/baaivision/CoS")
	    print(banner)

	    # Show token list
	    show_token_list()

	    # Run examples; list elements are evaluated in order, so the examples
	    # print in sequence 1, 2, 3.
	    chains = [
	        example_visual_grounding(),
	        example_spatial_comparison(),
	        example_multi_region_composition(),
	    ]

	    # Show benefits
	    demonstrate_region_token_benefits()

	    print("\n" + banner)
	    print("SUMMARY")
	    print(banner)
	    # The count is derived from the collected chains, not hard-coded.
	    print(f"✅ Generated {len(chains)} example reasoning chains")
	    print("✅ Demonstrated visual grounding with region tokens")
	    print("✅ Showed bounding box integration")
	    print(f"✅ Examples saved to chains: {[chain.chain_id for chain in chains]}")
	    print("\n💡 Use these patterns for training vision-language models with spatial reasoning!")
	    print()


	# Run the demo only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()