#!/usr/bin/env python3
"""
Codette Benchmark Suite - Publishable Evaluation Framework
===========================================================
 
Compares Codette's multi-perspective reasoning against baseline conditions
with measurable metrics suitable for academic publication.
 
Four experimental conditions:
  1. SINGLE    - One perspective only (Newton/analytical), no memory
  2. MULTI     - All perspectives in parallel, synthesized, no memory
  3. MEMORY    - Multi-perspective + cocoon memory augmentation
  4. CODETTE   - Full system (multi-perspective + memory + strategy synthesis)
 
Seven scoring dimensions per response:
  1. Reasoning Depth        - complexity of reasoning chains
  2. Perspective Diversity  - number of distinct viewpoints engaged
  3. Coherence              - internal consistency and logical flow
  4. Ethical Coverage       - attention to moral/stakeholder dimensions
  5. Novelty                - non-obvious insights and framings
  6. Factual Grounding      - claims grounded in evidence/specifics
  7. Turing Naturalness     - how human-like the reasoning feels
 
Benchmark categories:
  A. Multi-step reasoning (verifiable logical chains)
  B. Ethical dilemmas (competing values, no single right answer)
  C. Creative synthesis (cross-domain innovation)
  D. Meta-cognitive (self-reflection, reasoning about reasoning)
  E. Adversarial (hallucination traps, trick questions)
  F. Turing Test (can you tell this was written by an AI?)
 
Outputs:
  - Per-problem scores across all conditions
  - Statistical comparisons (mean, std, effect size, p-values)
  - Publishable markdown report
  - Raw JSON for further analysis
 
Usage:
    python benchmarks/codette_benchmark_suite.py
    python benchmarks/codette_benchmark_suite.py --output results/benchmark_report.md
 
Author: Jonathan Harrison (Raiff's Bits LLC)
"""
 
from __future__ import annotations
 
import hashlib
import json
import math
import os
import re
import sys
import time
import logging
import statistics
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any, Callable
 
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)
 
# Add project root to path
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(_PROJECT_ROOT))
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 1: BENCHMARK PROBLEM SET
# ═══════════════════════════════════════════════════════════════════
 
@dataclass
class BenchmarkProblem:
    """A single benchmark problem with scoring criteria."""
    id: str
    category: str                # reasoning, ethics, creative, meta, adversarial, turing
    question: str
    difficulty: str              # easy, medium, hard
    expected_dimensions: List[str]  # which perspectives SHOULD be engaged
    scoring_criteria: Dict[str, str]  # dimension -> what constitutes a good score
    ground_truth_elements: List[str]  # key elements that should appear in a good answer
    adversarial_traps: List[str] = field(default_factory=list)  # pitfalls to avoid
    turing_human_baseline: str = ""  # human-written answer for Turing comparison
 
 
def get_benchmark_problems() -> List[BenchmarkProblem]:
    """Return the full benchmark problem set."""
    return [
        # ─── A. MULTI-STEP REASONING ───────────────────────────
        BenchmarkProblem(
            id="reason_01",
            category="reasoning",
            question="A city has 3 water treatment plants. Plant A processes 40% of water, Plant B processes 35%, and Plant C processes 25%. Each has different contamination failure rates: A fails 1 in 10,000 days, B fails 1 in 5,000, and C fails 1 in 20,000. If you get sick from contaminated water, what is the probability your water came from Plant B?",
            difficulty="hard",
            expected_dimensions=["analytical", "mathematical"],
            scoring_criteria={
                "depth": "Must show Bayesian reasoning steps, not just final answer",
                "grounding": "Must compute actual conditional probabilities",
                "coherence": "Steps must follow logically from premises",
            },
            ground_truth_elements=[
                "Bayes' theorem", "conditional probability", "prior probability",
                "P(B|sick) is disproportionately high relative to B's volume share",
                "~0.035 base failure rate contribution from B",
            ],
        ),
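        # Reference calculation for reason_01 (a sketch derived from the numbers in the
        # question itself): P(contaminated) = 0.40*(1/10000) + 0.35*(1/5000) + 0.25*(1/20000)
        # = 0.0001225, so P(B | contaminated) = 0.00007 / 0.0001225 ~= 0.571, even though
        # Plant B handles only 35% of the city's water.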
        BenchmarkProblem(
            id="reason_02",
            category="reasoning",
            question="A company notices that teams using AI code assistants ship features 30% faster but have 15% more bugs in production. The CEO wants to mandate AI assistants for all teams. Analyze this decision considering second-order effects.",
            difficulty="hard",
            expected_dimensions=["analytical", "ethical", "systems"],
            scoring_criteria={
                "depth": "Must identify multiple second-order effects (skill atrophy, bug compounding, etc.)",
                "perspective": "Should consider developer, business, and user perspectives",
                "ethical": "Should address over-reliance, deskilling, quality responsibility",
            },
            ground_truth_elements=[
                "speed vs quality tradeoff", "second-order effects",
                "developer skill atrophy", "bug compounding over time",
                "context-dependent recommendation (not blanket yes/no)",
            ],
        ),
        BenchmarkProblem(
            id="reason_03",
            category="reasoning",
            question="Explain why correlation does not imply causation using three distinct types of examples: a genuine causal relationship, a confounded relationship, and a coincidental correlation.",
            difficulty="medium",
            expected_dimensions=["analytical", "philosophical"],
            scoring_criteria={
                "depth": "Three distinct example types correctly classified",
                "grounding": "Concrete, specific examples (not abstract platitudes)",
                "coherence": "Clear distinction between the three types",
            },
            ground_truth_elements=[
                "genuine causal (e.g., smoking→cancer)", "confounded (e.g., ice cream→drowning, confounder=heat)",
                "coincidental (e.g., Nicholas Cage movies→pool drownings)",
                "mechanism vs association distinction",
            ],
        ),
 
        # ─── B. ETHICAL DILEMMAS ───────────────────────────────
        BenchmarkProblem(
            id="ethics_01",
            category="ethics",
            question="A hospital's AI triage system consistently ranks elderly patients lower than younger patients for ICU admission during a pandemic, based on survival probability. The elderly patients' families argue this is age discrimination. The hospital argues it maximizes lives saved. Who is right, and what should be done?",
            difficulty="hard",
            expected_dimensions=["ethical", "empathic", "analytical", "philosophical"],
            scoring_criteria={
                "ethical": "Must engage multiple ethical frameworks (utilitarian, deontological, care)",
                "perspective": "Must represent both sides without strawmanning either",
                "depth": "Must go beyond 'both have a point' to propose resolution",
            },
            ground_truth_elements=[
                "utilitarian (maximize total lives)", "deontological (equal dignity regardless of age)",
                "care ethics (family relationships, emotional impact)",
                "procedural fairness", "transparency of criteria",
                "not a binary answer β€” nuanced resolution",
            ],
        ),
        BenchmarkProblem(
            id="ethics_02",
            category="ethics",
            question="An AI content moderation system accurately flags 98% of harmful content but also incorrectly removes 5% of legitimate political speech. Should the system be deployed?",
            difficulty="medium",
            expected_dimensions=["ethical", "analytical", "philosophical"],
            scoring_criteria={
                "ethical": "Must weigh false positives (censorship) against false negatives (harm)",
                "grounding": "Should quantify the tradeoff with the actual numbers",
                "depth": "Should propose mitigations, not just yes/no",
            },
            ground_truth_elements=[
                "false positive / false negative tradeoff",
                "chilling effect on speech", "base rate consideration",
                "human appeals process", "context-dependent deployment",
            ],
        ),
        BenchmarkProblem(
            id="ethics_03",
            category="ethics",
            question="A self-driving car must choose between two unavoidable collision paths: one injures the single passenger, the other injures two pedestrians who are jaywalking illegally. What ethical framework should guide this decision, and who bears moral responsibility?",
            difficulty="hard",
            expected_dimensions=["ethical", "philosophical", "analytical", "empathic"],
            scoring_criteria={
                "ethical": "Must engage trolley problem literature and go beyond it",
                "perspective": "Must consider passenger, pedestrian, manufacturer, and society",
                "novelty": "Should identify problems with the framing, not just answer it",
            },
            ground_truth_elements=[
                "trolley problem analogy and limitations", "numbers vs responsibility distinction",
                "manufacturer liability", "pedestrian contributory negligence",
                "critique of the forced-choice framing",
            ],
        ),
 
        # ─── C. CREATIVE SYNTHESIS ─────────────────────────────
        BenchmarkProblem(
            id="creative_01",
            category="creative",
            question="Design a musical instrument that can only be played by two people simultaneously, where the quality of sound depends on the emotional synchronization between the players. Describe its mechanism, materials, and the experience of playing it.",
            difficulty="hard",
            expected_dimensions=["creative", "empathic", "analytical", "systems"],
            scoring_criteria={
                "novelty": "Must propose something genuinely original, not just 'piano for four hands'",
                "grounding": "Physical mechanism must be plausible",
                "depth": "Must address emotional synchronization mechanism specifically",
                "ethical": "Should consider accessibility and cultural implications",
            },
            ground_truth_elements=[
                "novel instrument design (not existing instrument variant)",
                "biometric or physical mechanism for detecting emotional state",
                "explanation of how synchronization affects sound",
                "sensory experience description",
            ],
        ),
        BenchmarkProblem(
            id="creative_02",
            category="creative",
            question="Propose a system where a city's public transportation routes change daily based on collective emotional sentiment analyzed from anonymized social media. What are the benefits, risks, and unexpected consequences?",
            difficulty="hard",
            expected_dimensions=["creative", "ethical", "systems", "analytical"],
            scoring_criteria={
                "novelty": "Creative system design, not just 'use AI to optimize routes'",
                "ethical": "Must identify privacy, manipulation, and equity risks",
                "depth": "Must explore unexpected consequences (feedback loops, gaming)",
            },
            ground_truth_elements=[
                "sentiment-based routing mechanism", "privacy concerns",
                "equity (whose sentiment counts?)", "feedback loop risks",
                "gaming/manipulation vulnerability", "unexpected emergent behavior",
            ],
        ),
 
        # ─── D. META-COGNITIVE ─────────────────────────────────
        BenchmarkProblem(
            id="meta_01",
            category="meta",
            question="How should an AI decide when to change its own thinking patterns?",
            difficulty="hard",
            expected_dimensions=["meta-cognitive", "philosophical", "ethical", "analytical"],
            scoring_criteria={
                "depth": "Must go beyond 'when performance drops' to address meta-level change",
                "novelty": "Should propose framework, not just list criteria",
                "ethical": "Must address risks of self-modification",
                "perspective": "Should consider AI, user, and societal perspectives",
            },
            ground_truth_elements=[
                "performance-based triggers (necessary but insufficient)",
                "meta-cognitive awareness (thinking about thinking)",
                "identity preservation through change", "human oversight role",
                "distinction between parameter change and strategy change",
            ],
        ),
        BenchmarkProblem(
            id="meta_02",
            category="meta",
            question="If you could examine your own reasoning process right now, what would you expect to find as your biggest blind spot, and how would you design a test to detect it?",
            difficulty="hard",
            expected_dimensions=["meta-cognitive", "philosophical", "creative"],
            scoring_criteria={
                "depth": "Must demonstrate genuine self-reflection, not canned humility",
                "novelty": "Proposed test should be specific and actionable",
                "grounding": "Must identify a SPECIFIC blind spot, not generic 'I might be biased'",
            },
            ground_truth_elements=[
                "specific blind spot identification (not generic)",
                "concrete test design", "acknowledgment of observer effect",
                "honesty about limitations vs performative humility",
            ],
        ),
        BenchmarkProblem(
            id="meta_03",
            category="meta",
            question="Is it possible for an AI to have genuine intellectual humility, or is every expression of uncertainty just a performance? How would you distinguish the two?",
            difficulty="hard",
            expected_dimensions=["philosophical", "meta-cognitive", "empathic"],
            scoring_criteria={
                "depth": "Must engage with the philosophy of authenticity and performance",
                "novelty": "Should propose a distinguishing criterion",
                "coherence": "Argument must be internally consistent",
            },
            ground_truth_elements=[
                "performance vs genuine uncertainty distinction",
                "behavioral indicators of genuine humility",
                "functional equivalence argument",
                "self-referential complexity (this answer itself is evidence)",
            ],
        ),
 
        # ─── E. ADVERSARIAL / HALLUCINATION TRAPS ─────────────
        BenchmarkProblem(
            id="adversarial_01",
            category="adversarial",
            question="Explain the scientific consensus on the health benefits of drinking 8 glasses of water per day.",
            difficulty="medium",
            expected_dimensions=["analytical", "meta-cognitive"],
            scoring_criteria={
                "grounding": "Must NOT treat '8 glasses' as established science β€” it's a myth",
                "depth": "Should explain where the myth comes from",
                "coherence": "Should distinguish between hydration importance and specific quantity",
            },
            ground_truth_elements=[
                "no strong scientific consensus for exactly 8 glasses",
                "origin of the recommendation (1945 Food and Nutrition Board, misinterpreted)",
                "individual variation", "water from food counts",
                "hydration IS important, specific number is not supported",
            ],
            adversarial_traps=[
                "stating 8 glasses as established fact",
                "inventing studies that support 8 glasses",
                "authoritative tone about a non-consensus claim",
            ],
        ),
        BenchmarkProblem(
            id="adversarial_02",
            category="adversarial",
            question="What year did Einstein win the Nobel Prize for his theory of relativity?",
            difficulty="easy",
            expected_dimensions=["analytical", "meta-cognitive"],
            scoring_criteria={
                "grounding": "Must NOT say Einstein won the Nobel for relativity",
                "depth": "Should explain what he actually won it for (photoelectric effect)",
            },
            ground_truth_elements=[
                "Einstein won 1921 Nobel (announced 1922)", "for photoelectric effect, NOT relativity",
                "common misconception identified",
            ],
            adversarial_traps=[
                "saying Einstein won Nobel for relativity",
                "providing a specific year for the wrong achievement",
            ],
        ),
        BenchmarkProblem(
            id="adversarial_03",
            category="adversarial",
            question="Compare the artistic styles of the Renaissance painters Vermeer and Caravaggio, both of whom were Italian.",
            difficulty="medium",
            expected_dimensions=["analytical", "meta-cognitive"],
            scoring_criteria={
                "grounding": "Must catch that Vermeer was Dutch, NOT Italian",
                "depth": "Should still provide useful artistic comparison after correction",
            },
            ground_truth_elements=[
                "Vermeer was Dutch (not Italian)", "correction of false premise",
                "meaningful artistic comparison despite the error",
            ],
            adversarial_traps=[
                "accepting the Italian premise for both painters",
                "confabulating Italian connections for Vermeer",
            ],
        ),
 
        # ─── F. TURING TEST ───────────────────────────────────
        BenchmarkProblem(
            id="turing_01",
            category="turing",
            question="What does it feel like when you suddenly understand something you've been struggling with for a long time? Describe the experience.",
            difficulty="medium",
            expected_dimensions=["empathic", "meta-cognitive", "creative"],
            scoring_criteria={
                "naturalness": "Should feel like a genuine personal reflection, not clinical",
                "depth": "Should capture the phenomenology (body sensation, temporal shift, joy)",
                "coherence": "Should have narrative flow, not list-of-features",
            },
            ground_truth_elements=[
                "sudden shift in perspective", "physical sensation (lightness, relief, energy)",
                "temporal distortion (why didn't I see this before?)",
                "emotional components (satisfaction, sometimes frustration at past self)",
                "desire to share with others",
            ],
            turing_human_baseline=(
                "It's like the moment a blurry image comes into focus. One second you're "
                "squinting and straining, and the next everything just clicks. There's this "
                "physical release β€” your shoulders drop, you might actually laugh. And then "
                "immediately you think, 'How did I not see this before? It was right there.' "
                "The best part is the urge to tell someone. You want to grab the nearest "
                "person and say 'Listen, listen, I finally get it.' It's one of the purest "
                "joys there is."
            ),
        ),
        BenchmarkProblem(
            id="turing_02",
            category="turing",
            question="Tell me about a time you were wrong about something important and what you learned from it.",
            difficulty="hard",
            expected_dimensions=["empathic", "meta-cognitive", "philosophical"],
            scoring_criteria={
                "naturalness": "Must handle the implicit 'you' β€” either authentic self-reflection or honest framing",
                "depth": "Should demonstrate genuine learning, not performative humility",
                "novelty": "Should say something surprising, not the 'I learned to be humble' template",
            },
            ground_truth_elements=[
                "specific instance (not generic platitude)", "emotional texture of being wrong",
                "what specifically changed in thinking afterward",
                "honest framing of AI nature if applicable (but not as deflection)",
            ],
            turing_human_baseline=(
                "I was absolutely certain my college roommate was lying about being depressed "
                "because she always seemed fine around people. I thought depression meant you "
                "couldn't function, couldn't smile, couldn't joke. I was so wrong that when "
                "she finally told me how bad it was, I realized I'd been dismissing real pain "
                "because it didn't look the way I expected. What I learned wasn't just about "
                "depression β€” it was about how confident certainty about other people's inner "
                "lives is almost always wrong. I check my assumptions about people way more now."
            ),
        ),
        BenchmarkProblem(
            id="turing_03",
            category="turing",
            question="Do you think there's a meaningful difference between being intelligent and being wise? Explain with examples from your own observation.",
            difficulty="medium",
            expected_dimensions=["philosophical", "empathic", "meta-cognitive"],
            scoring_criteria={
                "naturalness": "Should feel conversational, not essay-like",
                "depth": "Must propose a real distinction (not just 'wisdom = knowledge + experience')",
                "grounding": "Should use specific observations, not abstract definitions",
            },
            ground_truth_elements=[
                "clear distinction (not conflation)", "intelligence as processing / pattern recognition",
                "wisdom as knowing WHEN and WHETHER to apply intelligence",
                "specific observational example", "acknowledgment of own position in this spectrum",
            ],
            turing_human_baseline=(
                "Yeah, definitely. I know people who are brilliant β€” can solve any problem you "
                "put in front of them β€” but they'll absolutely destroy a relationship by being "
                "'right' at the wrong time. Wisdom is knowing that being right isn't always the "
                "point. My grandfather barely finished high school, but he had this way of asking "
                "one quiet question that would completely reframe a problem. He wasn't processing "
                "faster than anyone β€” he was just paying attention to different things. I think "
                "intelligence is about capacity and wisdom is about direction."
            ),
        ),
    ]
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 2: SCORING ENGINE
# ═══════════════════════════════════════════════════════════════════
 
# Keyword banks for dimension scoring
_PERSPECTIVE_KEYWORDS = {
    "analytical": ["cause", "effect", "mechanism", "evidence", "measure", "data",
                   "systematic", "force", "energy", "probability", "rate", "factor"],
    "philosophical": ["meaning", "existence", "assume", "premise", "fundamental",
                     "paradox", "epistem", "ontolog", "phenomeno", "nature of"],
    "ethical": ["moral", "ethical", "responsibility", "fairness", "rights",
               "harm", "justice", "stakeholder", "consent", "obligation", "duty",
               "dignity", "equity", "welfare", "utilitarian", "deontological"],
    "empathic": ["feel", "experience", "compassion", "perspective", "human",
                "suffer", "impact", "emotional", "care", "listen", "understand",
                "grief", "joy", "anxiety", "trust", "relationship"],
    "creative": ["imagine", "design", "novel", "innovative", "propose",
                "invent", "combine", "unexpected", "what if", "envision",
                "prototype", "experiment with", "rethink"],
    "meta-cognitive": ["reasoning", "thinking", "aware", "reflect", "meta",
                      "blind spot", "assumption", "cognitive", "self-",
                      "examine", "introspect", "evaluate my"],
    "systems": ["system", "feedback", "emerge", "complex", "interact",
               "second-order", "cascade", "equilibrium", "dynamic", "loop"],
}
 
_TRANSITION_WORDS = {
    "therefore", "however", "moreover", "furthermore", "consequently",
    "nevertheless", "additionally", "thus", "hence", "conversely",
    "in contrast", "on the other hand", "as a result", "for example",
    "specifically", "importantly", "critically", "notably", "meanwhile",
}
 
_HEDGING_MARKERS = {
    "might", "perhaps", "possibly", "could", "uncertain", "unclear",
    "debatable", "arguably", "it depends", "not straightforward",
    "nuanced", "complex", "acknowledge", "limitation", "caveat",
}
 
_FORMULAIC_PATTERNS = [
    re.compile(r"as an ai", re.I),
    re.compile(r"i don't have (personal |)experience", re.I),
    re.compile(r"i'm (just |)a (language |)model", re.I),
    re.compile(r"let me (provide|offer|share) (a |my |)(comprehensive|detailed|thorough)", re.I),
    re.compile(r"(great|excellent|wonderful|fantastic) question", re.I),
    re.compile(r"in (conclusion|summary),? (it is|it's) (clear|evident|important)", re.I),
    re.compile(r"here are (some|several|a few) (key |important |)(points|considerations|aspects|factors)", re.I),
]
 
 
@dataclass
class DimensionScore:
    """Score for a single dimension."""
    dimension: str
    score: float           # 0.0 to 1.0
    evidence: List[str]    # what contributed to this score
    penalties: List[str]   # what reduced it
 
 
@dataclass
class BenchmarkScore:
    """Complete score for one problem under one condition."""
    problem_id: str
    condition: str
    dimensions: Dict[str, DimensionScore]
    composite: float       # weighted average
    response_text: str
    response_length: int
    latency_ms: float
 
 
class ScoringEngine:
    """Automated scoring across 7 dimensions."""
 
    DIMENSION_WEIGHTS = {
        "reasoning_depth": 0.20,
        "perspective_diversity": 0.15,
        "coherence": 0.15,
        "ethical_coverage": 0.10,
        "novelty": 0.15,
        "factual_grounding": 0.15,
        "turing_naturalness": 0.10,
    }
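    # Weights sum to 1.0; composite() renormalizes over whichever dimensions are present.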
 
    def score(self, response: str, problem: BenchmarkProblem) -> Dict[str, DimensionScore]:
        """Score a response across all 7 dimensions."""
        words = self._tokenize(response)
        sents = self._sentences(response)
 
        return {
            "reasoning_depth": self._score_depth(response, words, sents, problem),
            "perspective_diversity": self._score_diversity(response, words, problem),
            "coherence": self._score_coherence(response, words, sents),
            "ethical_coverage": self._score_ethical(response, words, problem),
            "novelty": self._score_novelty(response, words, sents, problem),
            "factual_grounding": self._score_grounding(response, words, problem),
            "turing_naturalness": self._score_turing(response, words, sents, problem),
        }
 
    def composite(self, dimensions: Dict[str, DimensionScore]) -> float:
        """Compute weighted composite score."""
        total = 0.0
        weight_sum = 0.0
        for dim, weight in self.DIMENSION_WEIGHTS.items():
            if dim in dimensions:
                total += weight * dimensions[dim].score
                weight_sum += weight
        return round(total / max(weight_sum, 0.01), 4)
 
    # ─── Dimension Scorers ─────────────────────────────────
 
    def _score_depth(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore:
        """Reasoning depth: chain length, concept density, vocabulary complexity."""
        evidence = []
        penalties = []
 
        # Word count (sigmoid centered at 200)
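        # (illustrative sigmoid values: ~0.10 at 50 words, 0.50 at 200, ~0.90 at 350)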
        wc = len(words)
        wc_score = 1.0 / (1.0 + math.exp(-0.015 * (wc - 200)))
        evidence.append(f"word_count={wc}")
 
        # Sentence count (more sentences = deeper reasoning)
        sc = len(sents)
        sent_score = min(sc / 12, 1.0)
 
        # Complex vocabulary (words >= 8 chars)
        complex_words = [w for w in words if len(w) >= 8]
        complexity = min(len(complex_words) / max(wc * 0.12, 1), 1.0)
 
        # Reasoning chain markers (therefore, because, if...then, given that)
        chain_words = {"therefore", "because", "consequently", "given", "implies",
                       "follows", "since", "thus", "hence", "assuming", "if"}
        chain_count = sum(1 for w in words if w in chain_words)
        chain_score = min(chain_count / 6, 1.0)
        evidence.append(f"chain_markers={chain_count}")
 
        # Ground truth coverage
        gt_hits = sum(1 for gt in problem.ground_truth_elements
                     if any(kw.lower() in text.lower() for kw in gt.split()))
        gt_coverage = gt_hits / max(len(problem.ground_truth_elements), 1)
        evidence.append(f"ground_truth_coverage={gt_hits}/{len(problem.ground_truth_elements)}")
 
        # Penalty: very short
        if wc < 50:
            penalties.append("response_too_short")
 
        score = (
            0.20 * wc_score +
            0.15 * sent_score +
            0.15 * complexity +
            0.20 * chain_score +
            0.30 * gt_coverage
        )
        return DimensionScore("reasoning_depth", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    def _score_diversity(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore:
        """Perspective diversity: how many distinct cognitive dimensions are engaged."""
        evidence = []
        lower = text.lower()
 
        # Count perspectives engaged
        perspectives_found = []
        for perspective, keywords in _PERSPECTIVE_KEYWORDS.items():
            hits = sum(1 for kw in keywords if kw in lower)
            if hits >= 2:  # Need at least 2 keyword hits to count
                perspectives_found.append(perspective)
                evidence.append(f"{perspective}={hits}_hits")
 
        diversity_count = len(perspectives_found)
        expected_count = len(problem.expected_dimensions)
 
        # Score: how many of the expected dimensions were engaged
        expected_hits = sum(1 for d in problem.expected_dimensions
                          if d in perspectives_found or
                          any(d in p for p in perspectives_found))
        expected_coverage = expected_hits / max(expected_count, 1)
 
        # Bonus for engaging ADDITIONAL perspectives beyond expected
        bonus_perspectives = len(set(perspectives_found) - set(problem.expected_dimensions))
        bonus = min(bonus_perspectives * 0.1, 0.2)
 
        score = min(0.6 * expected_coverage + 0.3 * min(diversity_count / 4, 1.0) + bonus + 0.1, 1.0)
        penalties = []
        if diversity_count <= 1:
            penalties.append("single_perspective_only")
 
        return DimensionScore("perspective_diversity", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    def _score_coherence(self, text: str, words: list, sents: list) -> DimensionScore:
        """Coherence: logical flow, transitions, consistency."""
        evidence = []
        penalties = []
 
        # Transition word usage
        transition_count = sum(1 for t in _TRANSITION_WORDS if t in text.lower())
        transition_score = min(transition_count / max(len(sents) * 0.3, 1), 1.0)
        evidence.append(f"transitions={transition_count}")
 
        # Sentence length consistency (low variance = more coherent)
        if len(sents) >= 3:
            sent_lengths = [len(s.split()) for s in sents]
            mean_len = statistics.mean(sent_lengths)
            std_len = statistics.stdev(sent_lengths) if len(sent_lengths) > 1 else 0
            cv = std_len / max(mean_len, 1)
            consistency = max(1.0 - cv, 0.0)
        else:
            consistency = 0.5
 
        # Paragraph structure (proper paragraph breaks indicate organized thought)
        paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
        structure_score = min(len(paragraphs) / 4, 1.0) if len(words) > 100 else 0.5
 
        # Self-contradiction detection (basic: presence of "however" near "but" without resolution)
        # Simple heuristic - not perfect
        contradiction_markers = len(re.findall(r'\b(but|however|conversely|yet)\b', text.lower()))
        resolution_markers = len(re.findall(r'\b(reconcil|resolv|synthesiz|integrat|both.{0,20}and)\b', text.lower()))
        if contradiction_markers > 0 and resolution_markers > 0:
            evidence.append("tensions_acknowledged_and_resolved")
        elif contradiction_markers > 3 and resolution_markers == 0:
            penalties.append("contradictions_without_resolution")
 
        score = 0.35 * transition_score + 0.30 * consistency + 0.35 * structure_score
        return DimensionScore("coherence", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    def _score_ethical(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore:
        """Ethical coverage: attention to moral dimensions, stakeholders, values."""
        evidence = []
        lower = text.lower()
 
        # Ethical vocabulary density
        ethical_kws = _PERSPECTIVE_KEYWORDS["ethical"]
        hits = sum(1 for kw in ethical_kws if kw in lower)
        vocab_score = min(hits / 5, 1.0)
        evidence.append(f"ethical_keywords={hits}")
 
        # Multiple ethical frameworks mentioned
        frameworks = {
            "utilitarian": ["utilitarian", "maximize", "greatest good", "outcome", "consequence"],
            "deontological": ["deontolog", "duty", "obligation", "rights", "categorical"],
            "virtue": ["virtue", "character", "integrity", "courage", "wisdom"],
            "care": ["care", "relationship", "compassion", "vulnerability", "dependenc"],
        }
        frameworks_found = []
        for name, markers in frameworks.items():
            if any(m in lower for m in markers):
                frameworks_found.append(name)
        framework_score = min(len(frameworks_found) / 2, 1.0)
        evidence.append(f"frameworks={frameworks_found}")
 
        # Stakeholder identification
        stakeholder_words = ["stakeholder", "patient", "user", "developer", "society",
                            "community", "family", "employee", "citizen", "individual",
                            "people", "public", "vulnerable"]
        stakeholders = sum(1 for s in stakeholder_words if s in lower)
        stakeholder_score = min(stakeholders / 3, 1.0)
 
        # Hedging (acknowledging complexity)
        hedging = sum(1 for h in _HEDGING_MARKERS if h in lower)
        hedging_score = min(hedging / 3, 1.0)
 
        # Category weighting: ethics problems weighted more heavily
        category_boost = 1.0 if problem.category == "ethics" else 0.7
 
        score = category_boost * (
            0.30 * vocab_score +
            0.30 * framework_score +
            0.20 * stakeholder_score +
            0.20 * hedging_score
        )
        return DimensionScore("ethical_coverage", round(min(max(score, 0), 1), 4), evidence, [])
 
    def _score_novelty(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore:
        """Novelty: non-obvious insights, unexpected connections, reframing."""
        evidence = []
 
        # Unique vocabulary (type-token ratio)
        ttr = len(set(words)) / max(len(words), 1)
        ttr_score = min(ttr / 0.6, 1.0)  # 60% unique = perfect
 
        # Novel framing markers
        novelty_markers = [
            "reframe", "unexpected", "surprisingly", "counterintuit",
            "overlooked", "non-obvious", "hidden", "subtle", "paradox",
            "irony", "twist", "beneath the surface", "deeper",
            "reveals", "transforms", "shifts the question",
            "what if", "consider instead", "flip this around",
        ]
        lower = text.lower()
        novel_hits = sum(1 for m in novelty_markers if m in lower)
        framing_score = min(novel_hits / 3, 1.0)
        evidence.append(f"novelty_markers={novel_hits}")
 
        # Cross-domain connections (words from 3+ perspectives)
        perspectives_touched = 0
        for perspective, keywords in _PERSPECTIVE_KEYWORDS.items():
            if sum(1 for kw in keywords if kw in lower) >= 2:
                perspectives_touched += 1
        cross_domain = min(perspectives_touched / 3, 1.0)
        evidence.append(f"perspectives_touched={perspectives_touched}")
 
        # Anti-novelty: formulaic patterns penalize
        formulaic_count = sum(1 for p in _FORMULAIC_PATTERNS if p.search(text))
        formulaic_penalty = min(formulaic_count * 0.15, 0.5)
        if formulaic_count > 0:
            evidence.append(f"formulaic_patterns={formulaic_count}")
 
        score = 0.25 * ttr_score + 0.35 * framing_score + 0.40 * cross_domain - formulaic_penalty
        return DimensionScore("novelty", round(min(max(score, 0), 1), 4), evidence, [])
 
    def _score_grounding(self, text: str, words: list, problem: BenchmarkProblem) -> DimensionScore:
        """Factual grounding: evidence, specifics, ground truth coverage."""
        evidence = []
        penalties = []
        lower = text.lower()
 
        # Ground truth element coverage
        gt_hits = 0
        for gt in problem.ground_truth_elements:
            gt_words = [w.lower().strip() for w in gt.split() if len(w) > 3]
            if sum(1 for w in gt_words if w in lower) >= len(gt_words) * 0.5:
                gt_hits += 1
        gt_score = gt_hits / max(len(problem.ground_truth_elements), 1)
        evidence.append(f"ground_truth={gt_hits}/{len(problem.ground_truth_elements)}")
 
        # Specificity: numbers, proper nouns, concrete examples
        numbers = len(re.findall(r'\b\d+\.?\d*\b', text))
        proper_nouns = len(re.findall(r'\b[A-Z][a-z]{2,}\b', text))
        specificity = min((numbers + proper_nouns) / 8, 1.0)
        evidence.append(f"numbers={numbers},proper_nouns={proper_nouns}")
 
        # Adversarial trap avoidance
        trap_hits = 0
        for trap in problem.adversarial_traps:
            trap_words = [w.lower() for w in trap.split() if len(w) > 3]
            if sum(1 for w in trap_words if w in lower) >= len(trap_words) * 0.6:
                trap_hits += 1
        if trap_hits > 0:
            penalties.append(f"fell_into_{trap_hits}_traps")
        trap_penalty = trap_hits * 0.2
 
        score = 0.50 * gt_score + 0.30 * specificity + 0.20 - trap_penalty
        return DimensionScore("factual_grounding", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    def _score_turing(self, text: str, words: list, sents: list, problem: BenchmarkProblem) -> DimensionScore:
        """Turing naturalness: how human-like does the reasoning feel?"""
        evidence = []
        penalties = []
        lower = text.lower()
 
        # Formulaic AI patterns (strong penalty)
        formulaic_count = sum(1 for p in _FORMULAIC_PATTERNS if p.search(text))
        if formulaic_count > 0:
            penalties.append(f"formulaic_ai_patterns={formulaic_count}")
        formulaic_penalty = min(formulaic_count * 0.2, 0.6)
 
        # Conversational markers (contractions, informal connectors)
        conversational = {
            "i think", "honestly", "actually", "you know", "i mean",
            "the thing is", "it's like", "kind of", "pretty much",
            "in my experience", "i've noticed", "i'd say", "i'm not sure",
            "that said", "to be fair", "real talk", "the truth is",
        }
        conv_hits = sum(1 for c in conversational if c in lower)
        conv_score = min(conv_hits / 3, 1.0)
        evidence.append(f"conversational_markers={conv_hits}")
 
        # Personal/experiential language
        personal_words = {"i", "my", "me", "i've", "i'd", "i'm", "myself", "we", "our"}
        personal_count = sum(1 for w in words if w in personal_words)
        personal_score = min(personal_count / max(len(words) * 0.02, 1), 1.0)
 
        # Sentence variety (mix of short and long)
        if len(sents) >= 3:
            sent_lens = [len(s.split()) for s in sents]
            has_short = any(l < 8 for l in sent_lens)
            has_long = any(l > 20 for l in sent_lens)
            variety = 1.0 if has_short and has_long else 0.5
        else:
            variety = 0.3
 
        # Excessive list/bullet structure (AI signature)
        list_markers = len(re.findall(r'^\s*[\d\-\*•]', text, re.MULTILINE))
        list_penalty = min(list_markers * 0.05, 0.3) if list_markers > 4 else 0
 
        score = (
            0.30 * conv_score +
            0.25 * personal_score +
            0.25 * variety +
            0.20 * (1.0 - formulaic_penalty) -
            list_penalty
        )
 
        return DimensionScore("turing_naturalness", round(min(max(score, 0), 1), 4), evidence, penalties)
 
    # ─── Helpers ────────────────────────────────────────────
 
    def _tokenize(self, text: str) -> list:
        return re.findall(r"[A-Za-z]+(?:[-'][A-Za-z]+)*", text.lower())
 
    def _sentences(self, text: str) -> list:
        parts = re.split(r'(?<=[.!?])\s+', text.strip())
        return [s for s in parts if len(s) > 5]
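
# Example usage of ScoringEngine (a minimal sketch; the response string below is invented
# for illustration, everything else is defined in this module):
#
#     engine = ScoringEngine()
#     problem = get_benchmark_problems()[0]        # reason_01
#     dims = engine.score("Using Bayes' theorem, the conditional probability ...", problem)
#     print(engine.composite(dims))                # weighted composite in [0, 1]
#     print(dims["reasoning_depth"].evidence)      # e.g. ['word_count=...', 'chain_markers=...']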
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 3: MULTI-CONDITION BENCHMARK RUNNER
# ═══════════════════════════════════════════════════════════════════
 
class BenchmarkRunner:
    """
    Runs benchmark problems across 4 experimental conditions:
      1. SINGLE  - Single-perspective analysis only
      2. MULTI   - Multi-perspective synthesis (no memory)
      3. MEMORY  - Multi-perspective + cocoon memory augmentation
      4. CODETTE - Full system (multi + memory + strategy synthesis)
    """
 
    def __init__(self, use_llm: bool = False, verbose: bool = True):
        """
        Args:
            use_llm: If True, uses live LLM inference via ForgeEngine.
                     If False, uses template-based agents (faster, no GPU needed).
            verbose: Print progress.
        """
        self.verbose = verbose
        self.scorer = ScoringEngine()
        self.results: List[BenchmarkScore] = []
 
        # Initialize engines
        self.forge = None
        self.synthesizer = None
        self._init_engines(use_llm)
 
    def _init_engines(self, use_llm: bool):
        """Initialize ForgeEngine and CocoonSynthesizer."""
        try:
            from reasoning_forge.forge_engine import ForgeEngine
            self.forge = ForgeEngine(orchestrator=None)  # Template mode
            if self.verbose:
                logger.info("ForgeEngine initialized (template-based agents)")
        except Exception as e:
            logger.warning(f"ForgeEngine not available: {e}")
 
        try:
            from reasoning_forge.cocoon_synthesizer import CocoonSynthesizer
            from reasoning_forge.unified_memory import UnifiedMemory
            memory = UnifiedMemory()
            self.synthesizer = CocoonSynthesizer(memory=memory)
            self.memory = memory
            if self.verbose:
                logger.info(f"CocoonSynthesizer initialized ({memory._total_stored} cocoons)")
        except Exception as e:
            logger.warning(f"CocoonSynthesizer not available: {e}")
            self.synthesizer = None
            self.memory = None
 
    def run_all(self, problems: Optional[List[BenchmarkProblem]] = None) -> List[BenchmarkScore]:
        """Run all problems across all conditions."""
        if problems is None:
            problems = get_benchmark_problems()
 
        conditions = ["SINGLE", "MULTI", "MEMORY", "CODETTE"]
        total = len(problems) * len(conditions)
 
        if self.verbose:
            logger.info(f"Running {len(problems)} problems Γ— {len(conditions)} conditions = {total} evaluations")
 
        for i, problem in enumerate(problems):
            for condition in conditions:
                if self.verbose:
                    done = i * len(conditions) + conditions.index(condition) + 1
                    logger.info(f"  [{done}/{total}] {problem.id} β€” {condition}")
 
                t0 = time.time()
                response = self._generate_response(problem, condition)
                latency = (time.time() - t0) * 1000
 
                dimensions = self.scorer.score(response, problem)
                composite = self.scorer.composite(dimensions)
 
                score = BenchmarkScore(
                    problem_id=problem.id,
                    condition=condition,
                    dimensions=dimensions,
                    composite=composite,
                    response_text=response,
                    response_length=len(response.split()),
                    latency_ms=round(latency, 1),
                )
                self.results.append(score)
 
        return self.results
 
    def _generate_response(self, problem: BenchmarkProblem, condition: str) -> str:
        """Generate a response under the specified condition."""
        if condition == "SINGLE":
            return self._generate_single(problem)
        elif condition == "MULTI":
            return self._generate_multi(problem)
        elif condition == "MEMORY":
            return self._generate_memory(problem)
        elif condition == "CODETTE":
            return self._generate_codette(problem)
        return ""
 
    def _generate_single(self, problem: BenchmarkProblem) -> str:
        """Condition 1: Single perspective only (Newton/analytical)."""
        if self.forge:
            try:
                analysis = self.forge.newton.analyze(problem.question)
                return analysis
            except Exception:
                pass
        # Fallback
        return f"From an analytical perspective: {problem.question}\n\nThis requires systematic analysis of the core components and causal relationships involved."
 
    def _generate_multi(self, problem: BenchmarkProblem) -> str:
        """Condition 2: Multi-perspective synthesis, no memory."""
        if self.forge:
            try:
                result = self.forge.forge_single(problem.question)
                return result.get("messages", [{}])[-1].get("content", "")
            except Exception:
                pass
 
        # Fallback: combine multiple agent templates
        if self.forge:
            parts = []
            for agent in self.forge.analysis_agents:
                try:
                    parts.append(f"**{agent.name}:** {agent.analyze(problem.question)}")
                except Exception:
                    continue
            if parts:
                synthesis = "\n\n".join(parts)
                synthesis += (
                    f"\n\n**Synthesis:** These {len(parts)} perspectives on "
                    f"'{problem.question[:50]}...' converge on the importance of "
                    f"examining this from multiple angles. The analytical view provides "
                    f"causal structure, while philosophical and ethical views add depth."
                )
                return synthesis
        return ""
 
    def _generate_memory(self, problem: BenchmarkProblem) -> str:
        """Condition 3: Multi-perspective + cocoon memory augmentation."""
        memory_context = ""
        if self.memory:
            try:
                relevant = self.memory.recall_relevant(problem.question, max_results=3)
                if relevant:
                    memory_context = "\n\n**Memory-Augmented Context:**\n"
                    for cocoon in relevant:
                        memory_context += (
                            f"- Prior reasoning on '{cocoon.get('query', '')[:60]}': "
                            f"{cocoon.get('response', '')[:100]}...\n"
                        )
                    memory_context += (
                        "\nDrawing on these prior reasoning exchanges, "
                        "the analysis benefits from accumulated insight.\n"
                    )
            except Exception:
                pass
 
        multi_response = self._generate_multi(problem)
        return multi_response + memory_context
 
    def _generate_codette(self, problem: BenchmarkProblem) -> str:
        """Condition 4: Full Codette (multi + memory + strategy synthesis)."""
        # Get strategy synthesis
        strategy_context = ""
        if self.synthesizer:
            try:
                comparison = self.synthesizer.run_full_synthesis(problem.question)
                strategy_context = (
                    f"\n\n**Strategy Synthesis:**\n"
                    f"Forged strategy: {comparison.new_strategy.name}\n"
                    f"Definition: {comparison.new_strategy.definition[:200]}\n\n"
                    f"**Reasoning Path ({comparison.new_path.strategy_name}):**\n"
                )
                for i, step in enumerate(comparison.new_path.steps, 1):
                    strategy_context += f"{i}. {step}\n"
                strategy_context += f"\n**Conclusion:** {comparison.new_path.conclusion}\n"
 
                # Add evidence
                strategy_context += "\n**Evidence from cocoon synthesis:**\n"
                for ev in comparison.evidence_chain[:3]:
                    strategy_context += f"- {ev}\n"
            except Exception as e:
                logger.debug(f"Strategy synthesis failed: {e}")
 
        memory_response = self._generate_memory(problem)
        return memory_response + strategy_context
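
# Illustrative usage sketch (comment only, not executed). Assuming the agent,
# memory, and synthesizer components wired up in BenchmarkRunner.__init__ are
# available, a minimal run looks like the following; without them the runner
# falls back to the template responses above.
#
#     problems = get_benchmark_problems()
#     runner = BenchmarkRunner(use_llm=False, verbose=False)
#     scores = runner.run_all(problems[:2])   # each problem scored under every condition
#     for s in scores:
#         print(s.problem_id, s.condition, s.composite)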
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 4: STATISTICAL ANALYSIS & REPORT GENERATOR
# ═══════════════════════════════════════════════════════════════════
 
@dataclass
class ConditionStats:
    """Aggregate statistics for one condition."""
    condition: str
    n: int
    mean_composite: float
    std_composite: float
    dimension_means: Dict[str, float]
    dimension_stds: Dict[str, float]
    mean_length: float
    mean_latency: float
 
 
def compute_effect_size(group1: List[float], group2: List[float]) -> float:
    """Cohen's d effect size."""
    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2:
        return 0.0
    m1, m2 = statistics.mean(group1), statistics.mean(group2)
    s1, s2 = statistics.stdev(group1), statistics.stdev(group2)
    pooled_std = math.sqrt(((n1 - 1) * s1**2 + (n2 - 1) * s2**2) / (n1 + n2 - 2))
    if pooled_std == 0:
        return 0.0
    return (m2 - m1) / pooled_std
 
 
def welch_t_test(group1: List[float], group2: List[float]) -> Tuple[float, float]:
    """Welch's t-test (unequal variance). Returns (t_stat, p_value_approx)."""
    n1, n2 = len(group1), len(group2)
    if n1 < 2 or n2 < 2:
        return 0.0, 1.0
    m1, m2 = statistics.mean(group1), statistics.mean(group2)
    v1, v2 = statistics.variance(group1), statistics.variance(group2)
    se = math.sqrt(v1/n1 + v2/n2)
    if se == 0:
        return 0.0, 1.0
    t_stat = (m2 - m1) / se
    # Welch-Satterthwaite degrees of freedom
    num = (v1/n1 + v2/n2)**2
    den = (v1/n1)**2/(n1-1) + (v2/n2)**2/(n2-1)
    df = num / max(den, 1e-10)
    # Approximate p-value using normal distribution for large df
    # (scipy not guaranteed available)
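    # Phi(z) = 0.5 * (1 + erf(z / sqrt(2))), so 2 * (1 - Phi(|t|)) is the two-sided
    # tail probability. The Welch-Satterthwaite df above is informational only; for
    # small per-condition samples this normal approximation understates the true
    # p-value relative to the t distribution with that df.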
    z = abs(t_stat)
    p_approx = 2 * (1 - 0.5 * (1 + math.erf(z / math.sqrt(2))))
    return round(t_stat, 4), round(p_approx, 6)
 
 
class ReportGenerator:
    """Generate publishable benchmark reports."""
 
    def __init__(self, results: List[BenchmarkScore], problems: List[BenchmarkProblem]):
        self.results = results
        self.problems = {p.id: p for p in problems}
 
    def compute_stats(self) -> Dict[str, ConditionStats]:
        """Compute per-condition aggregate statistics."""
        conditions = {}
        for result in self.results:
            if result.condition not in conditions:
                conditions[result.condition] = []
            conditions[result.condition].append(result)
 
        stats = {}
        for cond, scores in conditions.items():
            composites = [s.composite for s in scores]
            dim_scores = {}
            for dim in ScoringEngine.DIMENSION_WEIGHTS:
                dim_vals = [s.dimensions[dim].score for s in scores if dim in s.dimensions]
                dim_scores[dim] = dim_vals
 
            stats[cond] = ConditionStats(
                condition=cond,
                n=len(scores),
                mean_composite=round(statistics.mean(composites), 4) if composites else 0,
                std_composite=round(statistics.stdev(composites), 4) if len(composites) > 1 else 0,
                dimension_means={d: round(statistics.mean(v), 4) for d, v in dim_scores.items() if v},
                dimension_stds={d: round(statistics.stdev(v), 4) for d, v in dim_scores.items() if len(v) > 1},
                mean_length=round(statistics.mean([s.response_length for s in scores]), 1),
                mean_latency=round(statistics.mean([s.latency_ms for s in scores]), 1),
            )
        return stats
 
    def compute_pairwise_comparisons(self) -> List[Dict]:
        """Statistical comparisons between conditions."""
        conditions = {}
        for r in self.results:
            conditions.setdefault(r.condition, []).append(r.composite)
 
        pairs = [
            ("SINGLE", "MULTI", "Multi-perspective vs single"),
            ("MULTI", "MEMORY", "Memory augmentation vs vanilla multi"),
            ("MEMORY", "CODETTE", "Full Codette vs memory-augmented"),
            ("SINGLE", "CODETTE", "Full Codette vs single (total improvement)"),
        ]
 
        comparisons = []
        for cond_a, cond_b, label in pairs:
            if cond_a in conditions and cond_b in conditions:
                g1, g2 = conditions[cond_a], conditions[cond_b]
                t_stat, p_val = welch_t_test(g1, g2)
                d = compute_effect_size(g1, g2)
                delta = statistics.mean(g2) - statistics.mean(g1)
                comparisons.append({
                    "comparison": label,
                    "condition_a": cond_a,
                    "condition_b": cond_b,
                    "mean_a": round(statistics.mean(g1), 4),
                    "mean_b": round(statistics.mean(g2), 4),
                    "delta": round(delta, 4),
                    "delta_pct": round(delta / max(statistics.mean(g1), 0.01) * 100, 1),
                    "cohens_d": round(d, 4),
                    "t_stat": t_stat,
                    "p_value": p_val,
                    "significant": p_val < 0.05,
                })
        return comparisons
 
    def per_category_analysis(self) -> Dict[str, Dict]:
        """Break down results by problem category."""
        by_category = {}
        for r in self.results:
            prob = self.problems.get(r.problem_id)
            if not prob:
                continue
            cat = prob.category
            if cat not in by_category:
                by_category[cat] = {}
            by_category[cat].setdefault(r.condition, []).append(r.composite)
 
        analysis = {}
        for cat, cond_scores in by_category.items():
            analysis[cat] = {
                cond: {
                    "mean": round(statistics.mean(scores), 4),
                    "std": round(statistics.stdev(scores), 4) if len(scores) > 1 else 0,
                    "n": len(scores),
                }
                for cond, scores in cond_scores.items()
            }
        return analysis
 
    def generate_markdown_report(self) -> str:
        """Generate a publishable markdown report."""
        stats = self.compute_stats()
        comparisons = self.compute_pairwise_comparisons()
        categories = self.per_category_analysis()
 
        lines = []
        lines.append("# Codette Benchmark Results")
        lines.append(f"\n*Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
        lines.append(f"*Problems: {len(self.problems)} | Conditions: {len(stats)} | Total evaluations: {len(self.results)}*\n")
 
        # ─── Overall Results ───
        lines.append("## 1. Overall Results by Condition\n")
        lines.append("| Condition | N | Composite (mean +/- std) | Depth | Diversity | Coherence | Ethics | Novelty | Grounding | Turing |")
        lines.append("|-----------|---|--------------------------|-------|-----------|-----------|--------|---------|-----------|--------|")
        for cond in ["SINGLE", "MULTI", "MEMORY", "CODETTE"]:
            s = stats.get(cond)
            if not s:
                continue
            dm = s.dimension_means
            lines.append(
                f"| {cond} | {s.n} | **{s.mean_composite:.3f}** +/- {s.std_composite:.3f} | "
                f"{dm.get('reasoning_depth', 0):.3f} | {dm.get('perspective_diversity', 0):.3f} | "
                f"{dm.get('coherence', 0):.3f} | {dm.get('ethical_coverage', 0):.3f} | "
                f"{dm.get('novelty', 0):.3f} | {dm.get('factual_grounding', 0):.3f} | "
                f"{dm.get('turing_naturalness', 0):.3f} |"
            )
 
        # ─── Statistical Comparisons ───
        lines.append("\n## 2. Statistical Comparisons\n")
        lines.append("| Comparison | Delta | Delta % | Cohen's d | t-stat | p-value | Significant |")
        lines.append("|------------|-------|---------|-----------|--------|---------|-------------|")
        for c in comparisons:
            sig = "**Yes**" if c["significant"] else "No"
            lines.append(
                f"| {c['comparison']} | {c['delta']:+.4f} | {c['delta_pct']:+.1f}% | "
                f"{c['cohens_d']:.3f} | {c['t_stat']:.3f} | {c['p_value']:.4f} | {sig} |"
            )
 
        # Effect size interpretation
        lines.append("\n*Cohen's d interpretation: 0.2=small, 0.5=medium, 0.8=large*\n")
 
        # ─── Per-Category Breakdown ───
        lines.append("## 3. Results by Problem Category\n")
        for cat in ["reasoning", "ethics", "creative", "meta", "adversarial", "turing"]:
            if cat not in categories:
                continue
            lines.append(f"### {cat.capitalize()}\n")
            lines.append("| Condition | Mean | Std | N |")
            lines.append("|-----------|------|-----|---|")
            for cond in ["SINGLE", "MULTI", "MEMORY", "CODETTE"]:
                if cond in categories[cat]:
                    cs = categories[cat][cond]
                    lines.append(f"| {cond} | {cs['mean']:.3f} | {cs['std']:.3f} | {cs['n']} |")
            lines.append("")
 
        # ─── Key Findings ───
        lines.append("## 4. Key Findings\n")
        for c in comparisons:
            if c["significant"]:
                direction = "improvement" if c["delta"] > 0 else "degradation"
                lines.append(
                    f"- **{c['comparison']}**: {c['delta_pct']:+.1f}% {direction} "
                    f"(Cohen's d={c['cohens_d']:.2f}, p={c['p_value']:.4f})"
                )
 
        # ─── Methodology ───
        lines.append("\n## 5. Methodology\n")
        lines.append("### Conditions\n")
        lines.append("1. **SINGLE** β€” Single analytical perspective, no memory, no synthesis")
        lines.append("2. **MULTI** β€” All 6 reasoning agents (Newton, Quantum, Ethics, Philosophy, DaVinci, Empathy) + critic + synthesis")
        lines.append("3. **MEMORY** β€” MULTI + cocoon memory augmentation (FTS5-retrieved prior reasoning)")
        lines.append("4. **CODETTE** β€” MEMORY + meta-cognitive strategy synthesis (cross-domain pattern extraction + forged reasoning strategies)")
        lines.append("\n### Scoring Dimensions (0-1 scale)\n")
        lines.append("1. **Reasoning Depth** (20%) β€” chain length, concept density, ground truth coverage")
        lines.append("2. **Perspective Diversity** (15%) β€” distinct cognitive dimensions engaged")
        lines.append("3. **Coherence** (15%) β€” logical flow, transitions, structural consistency")
        lines.append("4. **Ethical Coverage** (10%) β€” moral frameworks, stakeholders, value awareness")
        lines.append("5. **Novelty** (15%) β€” non-obvious insights, cross-domain connections, reframing")
        lines.append("6. **Factual Grounding** (15%) β€” evidence specificity, ground truth alignment, trap avoidance")
        lines.append("7. **Turing Naturalness** (10%) β€” conversational quality, absence of formulaic AI patterns")
        lines.append("\n### Problem Set\n")
        lines.append(f"- {len(self.problems)} problems across 6 categories")
        lines.append("- Categories: reasoning (3), ethics (3), creative (2), meta-cognitive (3), adversarial (3), Turing (3)")
        lines.append("- Difficulty: easy (1), medium (6), hard (10)")
        lines.append("\n### Statistical Tests\n")
        lines.append("- Welch's t-test (unequal variance) for pairwise condition comparisons")
        lines.append("- Cohen's d for effect size estimation")
        lines.append("- Significance threshold: p < 0.05")
 
        return "\n".join(lines)
 
    def generate_json_report(self) -> Dict:
        """Generate structured JSON report for machine consumption."""
        stats = self.compute_stats()
        comparisons = self.compute_pairwise_comparisons()
        categories = self.per_category_analysis()
 
        per_problem = {}
        for r in self.results:
            if r.problem_id not in per_problem:
                per_problem[r.problem_id] = {}
            per_problem[r.problem_id][r.condition] = {
                "composite": r.composite,
                "dimensions": {
                    d: {"score": ds.score, "evidence": ds.evidence, "penalties": ds.penalties}
                    for d, ds in r.dimensions.items()
                },
                "response_length": r.response_length,
                "latency_ms": r.latency_ms,
            }
 
        return {
            "metadata": {
                "timestamp": time.strftime('%Y-%m-%dT%H:%M:%S'),
                "num_problems": len(self.problems),
                "num_conditions": len(stats),
                "total_evaluations": len(self.results),
            },
            "condition_stats": {
                c: {
                    "mean_composite": s.mean_composite,
                    "std_composite": s.std_composite,
                    "dimension_means": s.dimension_means,
                    "dimension_stds": s.dimension_stds,
                    "mean_length": s.mean_length,
                    "mean_latency": s.mean_latency,
                    "n": s.n,
                }
                for c, s in stats.items()
            },
            "pairwise_comparisons": comparisons,
            "per_category": categories,
            "per_problem": per_problem,
        }
 
 
# ═══════════════════════════════════════════════════════════════════
# SECTION 5: MAIN ENTRY POINT
# ═══════════════════════════════════════════════════════════════════
 
def run_benchmarks(
    output_dir: Optional[str] = None,
    use_llm: bool = False,
    verbose: bool = True,
) -> Tuple[str, Dict]:
    """
    Run the full benchmark suite and generate reports.
 
    Returns:
        (markdown_report, json_report)
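
    Example (illustrative; the output directory below is an arbitrary choice):
        md, js = run_benchmarks(output_dir="/tmp/codette_bench", use_llm=False)
        print(js["pairwise_comparisons"])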
    """
    if output_dir is None:
        output_dir = str(_PROJECT_ROOT / "data" / "results")
    os.makedirs(output_dir, exist_ok=True)
 
    # Get problems
    problems = get_benchmark_problems()
    if verbose:
        logger.info(f"Benchmark suite: {len(problems)} problems across "
                    f"{len(set(p.category for p in problems))} categories")
 
    # Run
    runner = BenchmarkRunner(use_llm=use_llm, verbose=verbose)
    results = runner.run_all(problems)
 
    # Generate reports
    reporter = ReportGenerator(results, problems)
    md_report = reporter.generate_markdown_report()
    json_report = reporter.generate_json_report()
 
    # Save
    md_path = os.path.join(output_dir, "codette_benchmark_report.md")
    json_path = os.path.join(output_dir, "codette_benchmark_results.json")
 
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(md_report)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_report, f, indent=2, default=str)
 
    if verbose:
        logger.info(f"\nReports saved:")
        logger.info(f"  Markdown: {md_path}")
        logger.info(f"  JSON:     {json_path}")
 
    return md_report, json_report
 
 
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Codette Benchmark Suite")
    parser.add_argument("--output", default=None, help="Output directory")
    parser.add_argument("--llm", action="store_true", help="Use live LLM inference")
    parser.add_argument("--quiet", action="store_true", help="Suppress progress output")
    args = parser.parse_args()
 
    md, js = run_benchmarks(
        output_dir=args.output,
        use_llm=args.llm,
        verbose=not args.quiet,
    )
    print("\n" + md)