# --- Standard Library & Core Dependencies ---
import os
import asyncio
import httpx
import numpy as np
import imagehash
from PIL import Image
from io import BytesIO
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Any, Optional

# --- Production Infrastructure Imports ---
from neo4j import GraphDatabase
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai
from telethon import TelegramClient

# ==================================================================
# TIMEZERO: Production Core Engine (No Mocks, No Simulated Latency)
# Architecture: Perceptual Hashing -> LLM Schema -> Vector Embedding
# -> Distributed Crawl -> Cosine Similarity Filter -> Neo4j Persistence
# ==================================================================

app = FastAPI(title="TimeZero Production Engine")

# --- Environment Configuration ---
# All credentials are injected via environment variables; each client below
# degrades to None when its credentials are absent, so the app can still boot
# with a subset of backends configured.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASS = os.getenv("NEO4J_PASS")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
TELEGRAM_API_ID = os.getenv("TELEGRAM_API_ID")
TELEGRAM_API_HASH = os.getenv("TELEGRAM_API_HASH")

# --- Client Initialization ---
# Neo4j (Ephemeral Graph Persistence) -- None when NEO4J_URI is unset;
# persistence is then silently skipped by persist_timezero_passport().
neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS)) if NEO4J_URI else None

# Pinecone (Vector Similarity Cache & Triage)
# NOTE(review): assumes the "timezero-schemas" index has already been
# provisioned — pc.Index() does not create it; confirm infra setup.
if PINECONE_API_KEY:
    pc = Pinecone(api_key=PINECONE_API_KEY)
    pinecone_index = pc.Index("timezero-schemas")
else:
    pinecone_index = None

# Gemini (Extraction & Embedding)
# NOTE(review): when GEMINI_API_KEY is unset, text_model/vision_model are
# never defined, so the extract_* helpers will raise NameError at call time
# rather than failing gracefully — confirm the key is always present in
# deployment, or guard at the call sites.
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
    text_model = genai.GenerativeModel('gemini-1.5-pro')
    vision_model = genai.GenerativeModel('gemini-1.5-pro')

# Telethon (Dark Social Firehose) -- None when TELEGRAM_API_ID is unset.
# Uses an on-disk session file named 'timezero_session'.
telegram_client = TelegramClient('timezero_session', TELEGRAM_API_ID, TELEGRAM_API_HASH) if TELEGRAM_API_ID else None

# ==================================================================
# STAGE 1: INGESTION &
# TRIAGE (The Screenshot Wall)
# ==================================================================

# Local additions required by fixes in this section (kept here because this
# chunk is self-contained; both are standard library).
import hashlib
from datetime import datetime, timezone


def compute_perceptual_hash(image_bytes: bytes) -> str:
    """Calculate a perceptual hash (pHash) for an uploaded screenshot.

    Visually identical claims collapse to the same hash string, letting the
    pipeline discard duplicates before the expensive vision-LLM stage.

    Args:
        image_bytes: Raw image file contents.

    Returns:
        Hex-style pHash string from ``imagehash.phash``.
    """
    img = Image.open(BytesIO(image_bytes))
    return str(imagehash.phash(img))


async def extract_schema_from_image(image_bytes: bytes) -> str:
    """Extract a structural behavioral schema from a screenshot via Gemini Vision.

    Raises:
        HTTPException: 503 when Gemini is not configured (previously this
            path raised a bare NameError because ``vision_model`` was never
            defined without GEMINI_API_KEY).
    """
    if not GEMINI_API_KEY:
        raise HTTPException(status_code=503, detail="Gemini API not configured.")
    img = Image.open(BytesIO(image_bytes))
    prompt = "Extract the core behavioral claim. Strip all specific adjectives, names, and rhetoric. Format exactly: [Actor Class] -> [Action] -> [Target Class] -> [Context]."
    # generate_content is blocking; off-load so the event loop stays free.
    response = await asyncio.to_thread(vision_model.generate_content, [prompt, img])
    return response.text.strip()


async def extract_schema_from_text(text: str) -> str:
    """Extract an entity-agnostic behavioral schema from a mutated text claim.

    Raises:
        HTTPException: 503 when Gemini is not configured (see note on
            :func:`extract_schema_from_image`).
    """
    if not GEMINI_API_KEY:
        raise HTTPException(status_code=503, detail="Gemini API not configured.")
    prompt = f"Analyze this claim: '{text}'. Strip all specific adjectives, names, and rhetoric. Return ONLY the structural behavioral schema in this exact format: [Actor Class] -> [Action] -> [Target Class] -> [Location Context]."
    response = await asyncio.to_thread(text_model.generate_content, prompt)
    return response.text.strip()


async def get_embedding(text: str) -> List[float]:
    """Generate a text-embedding-004 vector for cosine-similarity math."""
    result = await asyncio.to_thread(
        genai.embed_content,
        model="models/text-embedding-004",
        content=text,
        task_type="retrieval_document"
    )
    return result['embedding']


# ==================================================================
# STAGE 2: DISTRIBUTED CRAWL (Surface & Dark)
# ==================================================================
async def crawl_gdelt(keywords: str) -> List[Dict[str, Any]]:
    """Crawl the surface-web footprint via the GDELT DOC 2.0 API.

    Best-effort: returns an empty list on any network or parse failure so a
    single dead source never aborts the whole trace.
    """
    url = f"https://api.gdeltproject.org/api/v2/doc/doc?query={keywords}&mode=artlist&maxrecords=50&format=json"
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.get(url, timeout=10.0)
            data = resp.json()
        except Exception:
            return []
    nodes: List[Dict[str, Any]] = []
    for a in data.get("articles", []):
        try:
            # FIX: the previous int(seendate[:8]) produced a YYYYMMDD integer
            # (~2e7) while the Telegram/4plebs crawlers emit Unix epoch
            # seconds (~1.7e9). Mixing the units made GDELT articles always
            # sort as "oldest" in filter_and_prune(), corrupting Time Zero
            # detection. GDELT seendate looks like "20240615T120000Z"; parse
            # it to epoch seconds so all sources are comparable.
            ts = int(
                datetime.strptime(a["seendate"], "%Y%m%dT%H%M%SZ")
                .replace(tzinfo=timezone.utc)
                .timestamp()
            )
        except (KeyError, ValueError):
            continue  # skip malformed articles rather than abort the crawl
        nodes.append({
            "id": a["url"],
            "raw_text": a.get("title", "") + " " + a.get("seendate", ""),
            "source": "GDELT Surface",
            "timestamp": ts,
        })
    return nodes


async def crawl_telegram_dark_social(keywords: str) -> List[Dict[str, Any]]:
    """Search dark-social Telegram channels using Telethon (requires auth).

    Best-effort like the other crawlers: returns whatever was collected
    before any failure instead of propagating an exception into
    ``asyncio.gather`` (previously an auth/network error aborted the trace).
    """
    if not telegram_client:
        return []
    results: List[Dict[str, Any]] = []
    try:
        await telegram_client.start()
        # In production, this queries specific known high-risk channels
        async for message in telegram_client.iter_messages('all', search=keywords, limit=20):
            results.append({
                "id": f"tg_{message.id}",
                # FIX: media-only posts have text=None, which previously
                # leaked None into raw_text downstream.
                "raw_text": message.text or "",
                "source": f"Telegram: {message.chat.title if message.chat else 'Private'}",
                "timestamp": int(message.date.timestamp()),
            })
    except Exception:
        return results
    return results


async def crawl_4plebs(keywords: str) -> List[Dict[str, Any]]:
    """Deep dark-web origin tracing via the 4plebs archive search API.

    Best-effort: empty list on failure. Board "0" is the archive's default
    result bucket in its JSON response.
    """
    url = f"https://archive.4plebs.org/_/api/chan/search/?text={keywords}"
    headers = {'User-Agent': 'TimeZero/1.0'}
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.get(url, headers=headers, timeout=15.0)
            data = resp.json()
            if "0" in data:
                return [{
                    "id": str(p["num"]),
                    "raw_text": p.get("comment", ""),
                    "source": "4chan /pol/",
                    "timestamp": int(p["timestamp"])
                } for p in data["0"].get("posts", [])]
            return []
        except Exception:
            return []


# ==================================================================
# STAGE 3: SEMANTIC FILTERING & TEMPORAL PRUNING
# ==================================================================
def cosine_similarity(v1: List[float], v2: List[float]) -> float:
    """Cosine similarity between two embedding vectors.

    Mathematical isolation of behavioral schema vs keyword noise.
    Returns 0.0 for a degenerate zero vector (FIX: previously this divided
    by zero and produced NaN, which silently failed every threshold check).
    """
    vec1, vec2 = np.array(v1), np.array(v2)
    denom = float(np.linalg.norm(vec1) * np.linalg.norm(vec2))
    if denom == 0.0:
        return 0.0
    return float(np.dot(vec1, vec2) / denom)


async def filter_and_prune(schema_vector: List[float], raw_nodes: List[Dict[str, Any]], threshold: float = 0.85) -> List[Dict[str, Any]]:
    """Core TimeZero logic.

    1. Embeds every crawled node.
    2. Drops nodes that share keywords but fail behavioral similarity.
    3. Sorts the survivors temporally so index 0 is Time Zero.

    Args:
        schema_vector: Embedding of the extracted behavioral schema.
        raw_nodes: Crawled nodes; each needs "raw_text" and "timestamp".
        threshold: Minimum cosine similarity to keep a node.

    Returns:
        Surviving nodes (each annotated with "similarity"), oldest first.
    """
    valid_nodes: List[Dict[str, Any]] = []
    # Batch embedding would be used in scale production; sequential here
    # for clarity.
    for node in raw_nodes:
        if not node["raw_text"]:
            continue
        node_vector = await get_embedding(node["raw_text"])
        sim_score = cosine_similarity(schema_vector, node_vector)
        if sim_score >= threshold:
            node["similarity"] = sim_score
            valid_nodes.append(node)
    # Temporal Pruning (Oldest = Time Zero)
    valid_nodes.sort(key=lambda x: x["timestamp"])
    return valid_nodes


# ==================================================================
# STAGE 4: PERSISTENCE (Anti-Bloat Cypher)
# ==================================================================
def persist_timezero_passport(schema: str, schema_vector: List[float], time_zero_node: dict, mainstream_node: dict, depth: int):
    """Save ONLY the verified passport and laundering endpoints to Neo4j.

    MERGE keeps the graph idempotent: re-tracing a known schema updates the
    existing Claim/Source nodes instead of duplicating them. No-op when the
    Neo4j driver is not configured.
    """
    if not neo4j_driver:
        return
    query = """
    MERGE (c:Claim {schema: $schema})
    SET c.vector = $vector, c.last_verified = timestamp()
    MERGE (tz:Source {id: $tz_id})
    SET tz.name = $tz_name, tz.timestamp = $tz_time
    MERGE (ln:Source {id: $main_id})
    SET ln.name = $main_name, ln.timestamp = $main_time
    MERGE (tz)-[o:ORIGINATED {similarity: $tz_sim}]->(c)
    MERGE (ln)-[l:LAUNDERED {depth: $depth, similarity: $main_sim}]->(c)
    RETURN c
    """
    with neo4j_driver.session() as session:
        session.run(query,
            schema=schema,
            vector=schema_vector,
            tz_id=time_zero_node["id"],
            tz_name=time_zero_node["source"],
            tz_time=time_zero_node["timestamp"],
            tz_sim=time_zero_node["similarity"],
            main_id=mainstream_node["id"],
            main_name=mainstream_node["source"],
            main_time=mainstream_node["timestamp"],
            main_sim=mainstream_node["similarity"],
            depth=depth
        )


# ==================================================================
# API ENDPOINTS
# ==================================================================
@app.post("/api/trace/text")
async def trace_text_claim(claim: str = Form(...)):
    """Trace a text claim back to its Time Zero origin.

    Pipeline: schema extraction -> Pinecone cache check -> distributed crawl
    -> semantic filter & temporal prune -> Neo4j persistence -> cache upsert.

    Raises:
        HTTPException: 404 when no footprint is found or nothing matches the
            behavioral schema; 503 when Gemini is unconfigured.
    """
    # 1. Extract & Embed
    schema = await extract_schema_from_text(claim)
    schema_vector = await get_embedding(schema)

    # 2. Check Pinecone Cache (15ms return if known schema)
    if pinecone_index:
        cache_match = pinecone_index.query(vector=schema_vector, top_k=1, include_metadata=True)
        if cache_match['matches'] and cache_match['matches'][0]['score'] > 0.95:
            return {"status": "CACHE_HIT", "data": cache_match['matches'][0]['metadata']}

    # 3. Distributed Crawl (Keywords derived from schema)
    keywords = claim.split()[:3]  # Simplified keyword generation for crawl targets
    crawl_results = await asyncio.gather(
        crawl_gdelt(" ".join(keywords)),
        crawl_4plebs(" ".join(keywords)),
        crawl_telegram_dark_social(" ".join(keywords))
    )
    all_nodes = [node for source_list in crawl_results for node in source_list]
    if not all_nodes:
        raise HTTPException(status_code=404, detail="No digital footprint found.")

    # 4. Filter & Prune
    pruned_path = await filter_and_prune(schema_vector, all_nodes, threshold=0.85)
    if not pruned_path:
        raise HTTPException(status_code=404, detail="Footprints found, but none matched behavioral schema.")
    time_zero = pruned_path[0]
    mainstream = pruned_path[-1]

    # 5. Persist Ephemeral Graph
    persist_timezero_passport(schema, schema_vector, time_zero, mainstream, len(pruned_path))

    # 6. Update Pinecone Cache
    if pinecone_index:
        # FIX: built-in hash() is salted per interpreter process (PEP 456),
        # so str(hash(schema)) generated a *different* vector ID for the same
        # schema on every restart — bloating the index and defeating the
        # cache. SHA-256 of the schema text is stable across processes.
        cache_id = hashlib.sha256(schema.encode("utf-8")).hexdigest()
        pinecone_index.upsert(vectors=[{"id": cache_id, "values": schema_vector, "metadata": {"schema": schema, "time_zero": time_zero["source"]}}])

    return {
        "status": "TRACE_COMPLETE",
        "schema": schema,
        "time_zero_origin": time_zero,
        "mainstream_injection": mainstream,
        "laundering_depth": len(pruned_path)
    }