import re

import PyPDF2
from neo4j import GraphDatabase


class OpenClawIngestionPipeline:
    """Extract text from a PDF and store it in a Neo4j graph.

    The pipeline owns a single Neo4j driver, created in ``__init__`` and
    released by ``close``.
    """

    # Prefix of the string ``parse_pdf`` returns on failure. Callers should
    # test for this exact prefix instead of searching for the word "Error",
    # which can legitimately occur inside a document's text.
    _PARSE_ERROR_PREFIX = "Error parsing PDF: "

    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
        """Open a driver for the Neo4j instance at ``neo4j_uri``.

        Args:
            neo4j_uri: Connection URI, e.g. ``bolt://localhost:7687``.
            neo4j_user: User name for basic authentication.
            neo4j_password: Password for basic authentication.
        """
        self.driver = GraphDatabase.driver(
            neo4j_uri, auth=(neo4j_user, neo4j_password)
        )

    def close(self) -> None:
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def parse_pdf(self, file_path: str) -> str:
        """Extract and normalise the text of the PDF at ``file_path``.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The whitespace-normalised text of all pages on success. On any
            failure, a string of the form ``"Error parsing PDF: <reason>"``
            is returned instead of raising.
        """
        print(f"Parsing {file_path}...")
        try:
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # ``or ""`` guards against a page whose extraction yields
                # None, which would otherwise fail the concatenation.
                page_texts = [page.extract_text() or "" for page in reader.pages]
            return self.clean_text("\n".join(page_texts))
        except Exception as e:
            # Deliberate best-effort boundary: the documented contract is to
            # report failures through the return value.
            return f"{self._PARSE_ERROR_PREFIX}{e}"

    def clean_text(self, text: str) -> str:
        """Collapse every run of whitespace to one space and trim the ends."""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def inject_to_graph(self, person_name: str, document_name: str, content: str) -> None:
        """Upsert a Person, a Document and the relationship between them.

        ``MERGE`` is used for both nodes and for the ``OWNS_DOCUMENT``
        relationship, so repeated calls with the same names update the
        existing document rather than creating duplicates. The document's
        ``content``, ``type`` (always ``'Resume'``) and ``ingested_at``
        properties are overwritten on every call.

        Args:
            person_name: Value of the ``name`` property on the Person node.
            document_name: Value of the ``name`` property on the Document node.
            content: Text stored in the Document node's ``content`` property.
        """
        query = """
        MERGE (p:Person {name: $person_name})
        MERGE (d:Document {name: $document_name})
        MERGE (p)-[:OWNS_DOCUMENT]->(d)
        SET d.content = $content,
            d.type = 'Resume',
            d.ingested_at = timestamp()
        RETURN p, d
        """
        with self.driver.session() as session:
            result = session.run(
                query,
                person_name=person_name,
                document_name=document_name,
                content=content,
            )
            # Drain the result so any server-side failure is raised here,
            # before success is reported below.
            result.consume()
        print(
            f"[{document_name}] successfully injected into the Knowledge Graph "
            f"for {person_name}."
        )


if __name__ == "__main__":
    # Setup for Bennett's OpenClaw environment.
    # NOTE(review): credentials are hard-coded; consider loading them from
    # the environment or a secrets store.
    pipeline = OpenClawIngestionPipeline(
        "bolt://localhost:7687", "neo4j", "openclaw_admin"
    )
    try:
        # Extract the resume text, then store it only if parsing succeeded.
        extracted_text = pipeline.parse_pdf("Bennett_Vernon_Resume.pdf")
        if extracted_text.startswith(OpenClawIngestionPipeline._PARSE_ERROR_PREFIX):
            # Surface the failure instead of silently doing nothing.
            print(extracted_text)
        else:
            pipeline.inject_to_graph(
                "Bennett Vernon", "Bennett_Vernon_Resume.pdf", extracted_text
            )
    finally:
        # Release the driver even if ingestion raised.
        pipeline.close()