# File size: 4,084 Bytes
# 78a0ca9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import pandas as pd
from typing import List, Optional, Dict, Any
from dataclasses import dataclass
import data_designer.config as dd
from data_designer.interface import DataDesigner

@dataclass
class SyntheticDataConfig:
    """Settings for one synthetic instruction/response generation run.

    Attributes:
        name: Label for the dataset.
        num_records: Number of records to request from the generator.
        topics: Topic values to sample from. ``None`` (the default) or an
            empty list means "not provided".
        prompt_template: Prompt used for the instruction column; it refers
            to the sampled topic through the ``{{ topic }}`` placeholder.
        model_alias: Alias of the model used by the LLM-generated columns.
        output_path: Path of the JSONL file the generated records are
            written to.
    """

    name: str = "synthetic_dataset"
    num_records: int = 10
    # ``None`` is the "not provided" sentinel, so the annotation has to be
    # Optional; a bare ``List[str]`` contradicted the default value.
    topics: Optional[List[str]] = None
    prompt_template: str = "Create a high-quality instruction and response pair for the topic: {{ topic }}."
    model_alias: str = "perplexity-text"
    output_path: str = "synthetic_data.jsonl"

class DataPreparer:
    """Generate synthetic instruction/response data and format it for training.

    The class drives a ``DataDesigner`` instance to produce a dataframe with
    ``topic``, ``instruction`` and ``output`` columns, saves it as JSONL, and
    can convert such a dataframe into ChatML-formatted text records.
    """

    # Topics sampled when the config does not provide any.
    DEFAULT_TOPICS = ("Python Programming", "Data Science", "Machine Learning")

    def __init__(self, designer: Optional["DataDesigner"] = None):
        """Store the designer to use, building a default one if none is given.

        Args:
            designer: A ready-made ``DataDesigner``. When ``None``, one is
                created with a single OpenAI-compatible provider named
                ``"perplexity"``.
        """
        if designer is None:
            # Configure Perplexity provider (OpenAI-compatible).
            # NOTE(review): "PERPLEXITY_API_KEY" is presumably resolved by
            # data_designer as an environment variable name rather than used
            # as the literal key -- confirm against the library docs.
            perplexity_provider = dd.ModelProvider(
                name="perplexity",
                provider_type="openai",
                api_key="PERPLEXITY_API_KEY",
                endpoint="https://api.perplexity.ai",
            )
            designer = DataDesigner(model_providers=[perplexity_provider])
        self.designer = designer

    def generate_synthetic_data(self, config: "SyntheticDataConfig") -> pd.DataFrame:
        """Generate records as described by ``config`` and save them as JSONL.

        Args:
            config: Run settings (record count, topics, prompt template,
                model alias and output path).

        Returns:
            The generated dataset as a dataframe. The same data is written
            to ``config.output_path`` in JSON Lines format.
        """
        # Resolve the topics once so the log line and the sampler agree.
        topics = config.topics or list(self.DEFAULT_TOPICS)
        print(f"Generating {config.num_records} synthetic records for topics: {topics}")

        builder = self._build_config_builder(config, topics)

        # Run generation
        result = self.designer.create(config_builder=builder, num_records=config.num_records)
        df = result.load_dataset()

        # Create the target directory first: failing on a missing folder
        # after generation has finished would throw the results away.
        output_dir = os.path.dirname(config.output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save to JSONL
        df.to_json(config.output_path, orient="records", lines=True)
        print(f"Synthetic data saved to {config.output_path}")

        return df

    def _build_config_builder(self, config: "SyntheticDataConfig", topics: List[str]):
        """Assemble the model config and the topic/instruction/output columns."""
        # The model is registered under the alias the LLM columns refer to;
        # a hard-coded alias here breaks any config with a custom model_alias.
        perplexity_model = dd.ModelConfig(
            alias=config.model_alias,
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1),
        )
        builder = dd.DataDesignerConfigBuilder(model_configs=[perplexity_model])

        # Topic sampler: a categorical column named "topic".
        builder.add_column(
            dd.SamplerColumnConfig(
                name="topic",
                sampler_type=dd.SamplerType.CATEGORY,
                params=dd.CategorySamplerParams(values=topics),
            )
        )

        # LLM text column holding the instruction for the sampled topic.
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="instruction",
                model_alias=config.model_alias,
                prompt=f"{config.prompt_template}\n\nReturn only the instruction part.",
            )
        )

        # LLM text column holding the response to the generated instruction.
        builder.add_column(
            dd.LLMTextColumnConfig(
                name="output",
                model_alias=config.model_alias,
                prompt="Based on the instruction: {{ instruction }}, provide a detailed and accurate response.",
            )
        )
        return builder

    def format_for_qwen(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Format the dataframe into ChatML text records for Qwen training.

        Args:
            df: Dataframe with ``instruction`` and ``output`` columns.

        Returns:
            One ``{"text": ...}`` dict per row, where the text is a user turn
            containing the instruction followed by an assistant turn
            containing the output.

        Raises:
            KeyError: If a non-empty ``df`` lacks either required column.
        """
        # An empty frame yields no records, even if it has no columns at all.
        if df.empty:
            return []
        return [
            {
                "text": f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n{output}<|im_end|>"
            }
            for instruction, output in zip(df["instruction"], df["output"])
        ]

if __name__ == "__main__":
    # Demo run: generate a small two-topic dataset, then convert it to ChatML.
    demo_config = SyntheticDataConfig(
        output_path="test_synthetic.jsonl",
        topics=["Quantum Computing", "Space Exploration"],
        num_records=10,
    )
    data_preparer = DataPreparer()
    chatml_records = data_preparer.format_for_qwen(
        data_preparer.generate_synthetic_data(demo_config)
    )
    print(f"Formatted {len(chatml_records)} records for Qwen.")