#!/usr/bin/env python3
"""Verification Queue for MTP Development

Evidence entries in standardized format:
- id: Unique identifier
- type: Direct/Correlation/Inference
- status: VERIFIED/UNVERIFIED/PENDING
- source: URL or reference
- timestamp: When verified
"""

import json

# Minimum fraction of VERIFIED entries required to treat the evidence
# as sufficient for a decision (see main()).
CONFIDENCE_THRESHOLD = 0.7

VERIFICATION_QUEUE = [
    {
        "id": "MTP_ARCH_001",
        "type": "Direct",
        "evidence": "MTP architecture exists in Qwen3.5-27B (layer 64), single transformer block",
        "status": "VERIFIED",
        "source": "https://github.com/NodeNestor/qwen3.5-27b-mtp-llamacpp",
        "timestamp": "2026-05-05T07:16:00Z",
        "notes": "Weights ~800MB, requires GGUF injection",
    },
    {
        "id": "MTP_PERF_001",
        "type": "Direct",
        "evidence": "Baseline 17.9 tok/s vs K=1 MTP 12.5 tok/s (0.70×) on 5060Ti 16GB",
        "status": "VERIFIED",
        "source": "https://github.com/NodeNestor/qwen3.5-27b-mtp-llamacpp#performance-results",
        "timestamp": "2026-05-05T07:16:00Z",
        "notes": "Single-head MTP slower than baseline due to hybrid recurrent overhead",
    },
    {
        "id": "MTP_CHAIN_001",
        "type": "Correlation",
        "evidence": "Adaptive chained MTP achieves 1.99× over K=1, but 0.78× of baseline",
        "status": "VERIFIED",
        "source": "https://github.com/quivent/qwen-mtp-optimizations",
        "timestamp": "2026-05-05T07:16:00Z",
        "notes": "Config: MTP_CHAIN_KMAX=2 MTP_CHAIN_THRESH=0.85, tested on similar hardware",
    },
    {
        "id": "MTP_HYBRID_001",
        "type": "Inference",
        "evidence": "DeltaNet recurrence is irreversible, breaks standard speculative decoding assumptions",
        "status": "VERIFIED",
        "source": "https://github.com/quivent/qwen-mtp-research",
        "timestamp": "2026-05-05T07:16:00Z",
        "notes": "Requires full snapshot/restore or in-graph AR loop",
    },
    {
        "id": "MTP_VRAM_001",
        "type": "Inference",
        "evidence": "Total overhead ~1-2GB (800MB weights + 150MB recurrent + 50-100MB checkpoint)",
        "status": "VERIFIED",
        "source": "Calculated from NodeNestor repo specs",
        "timestamp": "2026-05-05T07:16:00Z",
        "notes": "Leaves ~6.85GB for Qwopus3.5-9B Q8_0, tight but workable",
    },
    {
        "id": "MTP_CUDA_001",
        "type": "Direct",
        "evidence": "Uses standard CUDA async copies, no 13.2-specific features",
        "status": "VERIFIED",
        "source": "https://github.com/quivent/qwen-mtp-llamacpp",
        "timestamp": "2026-05-05T07:16:00Z",
        "notes": "Compatible with CUDA 13.2, tensor split 8:16 tested",
    },
    {
        "id": "MTP_BUG_001",
        "type": "Correlation",
        "evidence": "Cache-bookkeeping bug caused 60% false positive speedup reports",
        "status": "VERIFIED",
        "source": "https://github.com/quivent/qwen-mtp-research",
        "timestamp": "2026-05-05T07:16:00Z",
        "notes": "Must validate output coherence, not just throughput",
    },
    {
        "id": "MTP_RECURRENT_001",
        "type": "Inference",
        "evidence": "2-token verification batch takes 1.75× time (vs 1.0× for attention-only)",
        "status": "VERIFIED",
        "source": "https://github.com/quivent/qwen-mtp-llamacpp",
        "timestamp": "2026-05-05T07:16:00Z",
        "notes": "Sequential recurrent processing negates compute savings",
    },
]


def compute_confidence(entries):
    """Return (verified_count, confidence) for a list of evidence entries.

    ``verified_count`` is the number of entries whose "status" is exactly
    "VERIFIED"; ``confidence`` is that count divided by the total number of
    entries. An empty list yields (0, 0.0) rather than raising
    ZeroDivisionError.
    """
    verified = sum(1 for entry in entries if entry["status"] == "VERIFIED")
    confidence = verified / len(entries) if entries else 0.0
    return verified, confidence


def main():
    """Dump the queue as JSON and print a verification summary verdict."""
    print(json.dumps(VERIFICATION_QUEUE, indent=2))
    verified, confidence = compute_confidence(VERIFICATION_QUEUE)
    print(f"\nSummary: {verified}/{len(VERIFICATION_QUEUE)} entries verified")
    print(f"Confidence Score: {confidence:.2f}")
    if confidence >= CONFIDENCE_THRESHOLD:
        print("[VERIFIED] Sufficient evidence for decision")
    else:
        print("[DEFER] Insufficient evidence")


if __name__ == "__main__":
    main()