""" generate_synthetic.py ---------------------- Generates realistic synthetic MNREGA district-level data for Maharashtra. Mimics the structure of real data available from: - nregarep1.nic.in (MoRD official portal) - dataful.in (district-wise persondays + expenditure) Columns produced match what you'd get from real sources: state, district, financial_year, households_demanded, households_offered, households_availed, person_days, expenditure_lakhs, avg_wage_rate, works_completed Design principles for realism: - Each district has a stable "base capacity" (some districts are structurally larger / more active than others) - Year-on-year growth follows real MNREGA trends (spike in 2020-21 due to COVID reverse migration, slowdown in urban-adjacent districts) - Expenditure correlates with person_days but has noise (efficiency varies) - Wage rate increases over years (matches real wage revision schedule) - ~8% missing values injected randomly to simulate real data quality """ import numpy as np import pandas as pd import os # ── Maharashtra districts (all 36) ─────────────────────────────────────────── MAHARASHTRA_DISTRICTS = [ "Ahmednagar", "Akola", "Amravati", "Aurangabad", "Beed", "Bhandara", "Buldhana", "Chandrapur", "Dhule", "Gadchiroli", "Gondia", "Hingoli", "Jalgaon", "Jalna", "Kolhapur", "Latur", "Mumbai City", "Mumbai Suburban", "Nagpur", "Nanded", "Nandurbar", "Nashik", "Osmanabad", "Palghar", "Parbhani", "Pune", "Raigad", "Ratnagiri", "Sangli", "Satara", "Sindhudurg", "Solapur", "Thane", "Wardha", "Washim", "Yavatmal" ] YEARS = [ "2014-15", "2015-16", "2016-17", "2017-18", "2018-19", "2019-20", "2020-21", "2021-22", "2022-23", "2023-24" ] # Real MNREGA wage rates in Maharashtra (approx ₹/day by year) WAGE_RATES = { "2014-15": 162, "2015-16": 174, "2016-17": 183, "2017-18": 194, "2018-19": 203, "2019-20": 213, "2020-21": 238, "2021-22": 256, "2022-23": 273, "2023-24": 289 } # Year-level demand multipliers based on real MNREGA trends # COVID year (2020-21) saw massive spike due to reverse migration YEAR_MULTIPLIERS = { "2014-15": 0.85, "2015-16": 0.90, "2016-17": 0.92, "2017-18": 0.95, "2018-19": 1.00, "2019-20": 1.05, "2020-21": 1.45, "2021-22": 1.20, "2022-23": 1.10, "2023-24": 1.08 } # District profile: (base_persondays_lakhs, efficiency_score, rural_weight) # Urban/peri-urban districts have lower base; tribal/rural have higher DISTRICT_PROFILES = { "Gadchiroli": (18.5, 0.72, 0.95), "Nandurbar": (16.2, 0.68, 0.93), "Yavatmal": (15.8, 0.74, 0.91), "Amravati": (14.3, 0.76, 0.88), "Chandrapur": (13.9, 0.71, 0.87), "Washim": (12.1, 0.73, 0.89), "Buldhana": (11.8, 0.75, 0.86), "Beed": (11.5, 0.70, 0.90), "Hingoli": (10.9, 0.72, 0.88), "Osmanabad": (10.7, 0.69, 0.87), "Latur": (10.4, 0.71, 0.85), "Nanded": (10.2, 0.73, 0.84), "Jalna": (9.8, 0.74, 0.85), "Parbhani": (9.5, 0.72, 0.84), "Akola": (9.3, 0.75, 0.83), "Dhule": (9.1, 0.70, 0.85), "Gondia": (8.9, 0.76, 0.82), "Bhandara": (8.6, 0.74, 0.81), "Wardha": (8.3, 0.77, 0.80), "Ahmednagar": (8.1, 0.78, 0.79), "Solapur": (7.9, 0.76, 0.80), "Aurangabad": (7.6, 0.79, 0.75), "Jalgaon": (7.4, 0.77, 0.77), "Nashik": (7.1, 0.80, 0.73), "Satara": (6.8, 0.81, 0.74), "Sangli": (6.5, 0.80, 0.73), "Kolhapur": (6.2, 0.82, 0.71), "Palghar": (6.0, 0.75, 0.78), "Nandurbar": (5.8, 0.71, 0.82), "Ratnagiri": (5.5, 0.79, 0.74), "Sindhudurg": (5.1, 0.80, 0.72), "Raigad": (4.8, 0.78, 0.68), "Pune": (4.2, 0.83, 0.55), "Thane": (3.5, 0.81, 0.45), "Mumbai Suburban": (1.2, 0.85, 0.15), "Mumbai City": (0.4, 0.88, 0.05), } def generate(seed: int = 42, missing_rate: float = 0.08) -> pd.DataFrame: """ Generate a synthetic MNREGA dataset for Maharashtra. Args: seed : Random seed for reproducibility. missing_rate: Fraction of cells to nullify (simulates real data gaps). Returns: DataFrame with realistic MNREGA data. """ rng = np.random.default_rng(seed) records = [] for district in MAHARASHTRA_DISTRICTS: profile = DISTRICT_PROFILES.get(district, (7.0, 0.75, 0.70)) base_pd, efficiency, rural_w = profile for year in YEARS: year_mult = YEAR_MULTIPLIERS[year] wage = WAGE_RATES[year] # ── Person days (in lakhs) ──────────────────────────────────── noise = rng.normal(1.0, 0.07) person_days_lakhs = base_pd * year_mult * noise person_days_lakhs = max(person_days_lakhs, 0.1) # ── Households ─────────────────────────────────────────────── # Avg ~45 days per household → households = person_days / 45 hh_demanded = int(person_days_lakhs * 1e5 / 38 * rng.uniform(1.05, 1.15)) hh_offered = int(hh_demanded * rng.uniform(0.92, 0.99)) hh_availed = int(hh_offered * rng.uniform(0.88, 0.97)) # ── Expenditure (₹ lakhs) ──────────────────────────────────── # Base = person_days * wage_rate, efficiency introduces noise base_expenditure = person_days_lakhs * 1e5 * wage / 1e5 expenditure_lakhs = base_expenditure / efficiency * rng.uniform(0.93, 1.07) # ── Works completed ────────────────────────────────────────── works = int(person_days_lakhs * rng.uniform(18, 35)) records.append({ "state": "Maharashtra", "district": district, "financial_year": year, "households_demanded": hh_demanded, "households_offered": hh_offered, "households_availed": hh_availed, "person_days_lakhs": round(person_days_lakhs, 3), "expenditure_lakhs": round(expenditure_lakhs, 2), "avg_wage_rate": wage, "works_completed": works, }) df = pd.DataFrame(records) # ── Inject realistic missing values ────────────────────────────────────── nullable_cols = [ "households_demanded", "households_offered", "households_availed", "works_completed" ] for col in nullable_cols: mask = rng.random(len(df)) < missing_rate df.loc[mask, col] = np.nan print(f"[generate] Created {len(df)} rows × {len(df.columns)} columns") print(f"[generate] Districts: {df['district'].nunique()} | Years: {df['financial_year'].nunique()}") print(f"[generate] Missing values injected: ~{missing_rate*100:.0f}% per nullable column") return df def save(df: pd.DataFrame, path: str = "data/raw/mnrega_maharashtra_synthetic.csv") -> None: os.makedirs(os.path.dirname(path), exist_ok=True) df.to_csv(path, index=False) print(f"[generate] Saved → {path}") if __name__ == "__main__": df = generate() save(df) print("\nSample:") print(df.head(6).to_string(index=False))