
#  Copyright (c) "Neo4j"
#  Neo4j Sweden AB [https://neo4j.com]
#  #
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  #
#      https://www.apache.org/licenses/LICENSE-2.0
#  #
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from __future__ import annotations

import logging
import warnings
from collections import defaultdict
from timeit import default_timer
from typing import Any, Optional, AsyncGenerator
import asyncio

from neo4j_graphrag.utils.logging import prettify

try:
    from neo4j_viz import Node, Relationship, VisualizationGraph

    neo4j_viz_available = True
except ImportError:
    neo4j_viz_available = False

from pydantic import BaseModel

from neo4j_graphrag.experimental.pipeline.component import Component
from neo4j_graphrag.experimental.pipeline.exceptions import (
    PipelineDefinitionError,
)
from neo4j_graphrag.experimental.pipeline.orchestrator import Orchestrator
from neo4j_graphrag.experimental.pipeline.pipeline_graph import (
    PipelineEdge,
    PipelineGraph,
    PipelineNode,
)
from neo4j_graphrag.experimental.pipeline.stores import InMemoryStore, ResultStore
from neo4j_graphrag.experimental.pipeline.types.definitions import (
    ComponentDefinition,
    ConnectionDefinition,
    PipelineDefinition,
)
from neo4j_graphrag.experimental.pipeline.types.orchestration import RunResult
from neo4j_graphrag.experimental.pipeline.types.context import RunContext
from neo4j_graphrag.experimental.pipeline.notification import (
    EventCallbackProtocol,
    Event,
    PipelineEvent,
    EventType,
)


logger = logging.getLogger(__name__)


class TaskPipelineNode(PipelineNode):
    """Runnable node. It must have:
    - a name (unique within the pipeline)
    - a component instance
    """

    def __init__(self, name: str, component: Component):
        """TaskPipelineNode is a graph node with a run method.

        Args:
            name (str): node's name
            component (Component): component instance
        """
        super().__init__(name, {})
        self.component = component

    async def execute(
        self, context: RunContext, inputs: dict[str, Any]
    ) -> RunResult | None:
        """Execute the task

        Returns:
            RunResult | None: RunResult with status and result dict
            if the task ran successfully, None if the status update
            was unsuccessful.
        """
        component_result = await self.component.run_with_context(
            context_=context, **inputs
        )
        run_result = RunResult(
            result=component_result,
        )
        return run_result

    async def run(
        self, context: RunContext, inputs: dict[str, Any]
    ) -> RunResult | None:
        """Main method to execute the task."""
        logger.debug(f"TASK START {self.name=} input={prettify(inputs)}")
        start_time = default_timer()
        res = await self.execute(context, inputs)
        end_time = default_timer()
        logger.debug(
            f"TASK FINISHED {self.name} in {end_time - start_time} res={prettify(res)}"
        )
        return res
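
# A minimal sketch of a component that a TaskPipelineNode can wrap, kept as a
# comment so the module body is unchanged. The class and field names
# ("WordCounter", "WordCountModel", "text", "count") are illustrative, not part
# of this module; Component and DataModel are assumed to come from
# neo4j_graphrag.experimental.pipeline.component.
#
#     from neo4j_graphrag.experimental.pipeline.component import (
#         Component,
#         DataModel,
#     )
#
#     class WordCountModel(DataModel):
#         count: int
#
#     class WordCounter(Component):
#         async def run(self, text: str) -> WordCountModel:
#             return WordCountModel(count=len(text.split()))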


class PipelineResult(BaseModel):
    run_id: str
    result: Any


class Pipeline(PipelineGraph[TaskPipelineNode, PipelineEdge]):
    """This is the main pipeline, where components
    and their execution order are defined"""

    def __init__(
        self,
        store: Optional[ResultStore] = None,
        callback: Optional[EventCallbackProtocol] = None,
    ) -> None:
        super().__init__()
        self.store = store or InMemoryStore()
        self.callbacks = [callback] if callback else []
        self.final_results = InMemoryStore()
        self.is_validated = False
        self.param_mapping: dict[str, dict[str, dict[str, str]]] = defaultdict(dict)
        """
        Dict structure:
        {
            component_name : {
                param_name: {
                    component: "",  # source component name
                    param_name: "",
                }
            }
        }
        """
        self.missing_inputs: dict[str, list[str]] = defaultdict()

    @classmethod
    def from_template(
        cls, pipeline_template: PipelineDefinition, store: Optional[ResultStore] = None
    ) -> Pipeline:
        warnings.warn(
            "from_template is deprecated, use from_definition instead",
            DeprecationWarning,
            stacklevel=2,
        )
        return cls.from_definition(pipeline_template, store)

    @classmethod
    def from_definition(
        cls,
        pipeline_definition: PipelineDefinition,
        store: Optional[ResultStore] = None,
    ) -> Pipeline:
        """Create a Pipeline from a pydantic model defining the
        components and their connections.

        Args:
            pipeline_definition (PipelineDefinition): An object defining components
                and how they are connected to each other.
            store (Optional[ResultStore]): Where the results are stored. By default,
                uses the InMemoryStore.
        """
        pipeline = Pipeline(store=store)
        for component in pipeline_definition.components:
            pipeline.add_component(
                component.component,
                component.name,
            )
        for edge in pipeline_definition.connections:
            pipeline_edge = PipelineEdge(
                edge.start, edge.end, data={"input_config": edge.input_config}
            )
            pipeline.add_edge(pipeline_edge)
        return pipeline

    def show_as_dict(self) -> dict[str, Any]:
        component_config = []
        for name, task in self._nodes.items():
            component_config.append(
                ComponentDefinition(name=name, component=task.component)
            )
        connection_config = []
        for edge in self._edges:
            connection_config.append(
                ConnectionDefinition(
                    start=edge.start,
                    end=edge.end,
                    input_config=edge.data["input_config"] if edge.data else {},
                )
            )
        pipeline_config = PipelineDefinition(
            components=component_config, connections=connection_config
        )
        return pipeline_config.model_dump()
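    # Hedged sketch: the same two-component pipeline built either
    # programmatically or from a PipelineDefinition. "splitter" / "writer" and
    # the SplitterComponent / WriterComponent classes are hypothetical
    # placeholders, not part of this module.
    #
    #     pipeline = Pipeline()
    #     pipeline.add_component(SplitterComponent(), "splitter")
    #     pipeline.add_component(WriterComponent(), "writer")
    #
    #     # or, equivalently, from a definition object:
    #     definition = PipelineDefinition(
    #         components=[
    #             ComponentDefinition(name="splitter", component=SplitterComponent()),
    #             ComponentDefinition(name="writer", component=WriterComponent()),
    #         ],
    #         connections=[
    #             ConnectionDefinition(
    #                 start="splitter",
    #                 end="writer",
    #                 input_config={"chunks": "splitter.chunks"},
    #             )
    #         ],
    #     )
    #     pipeline = Pipeline.from_definition(definition)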
    def draw(
        self, path: str, layout: str = "dot", hide_unused_outputs: bool = True
    ) -> Any:
        """Render the pipeline graph to an HTML file at the specified path"""
        G = self._get_neo4j_viz_graph(hide_unused_outputs)
        # Write the visualization to an HTML file
        with open(path, "w") as f:
            f.write(G.render().data)
        return G
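    # Usage sketch for draw (assumes the optional neo4j-viz dependency is
    # installed); the output path is illustrative:
    #
    #     pipeline.draw("pipeline.html", hide_unused_outputs=False)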
    def _get_neo4j_viz_graph(
        self, hide_unused_outputs: bool = True
    ) -> VisualizationGraph:
        """Generate a neo4j-viz visualization of the pipeline graph"""
        if not neo4j_viz_available:
            raise ImportError(
                "Could not import neo4j-viz. "
                "Install it with 'pip install \"neo4j-graphrag[experimental]\"'"
            )
        self.validate_parameter_mapping()

        nodes = []
        relationships = []
        node_ids = {}  # Map node names to their numeric IDs
        next_id = 0

        # Create nodes for each component
        for n, pipeline_node in self._nodes.items():
            comp_inputs = ", ".join(
                f"{i}: {d['annotation']}"
                for i, d in pipeline_node.component.component_inputs.items()
            )
            node_ids[n] = next_id
            label = f"{pipeline_node.component.__class__.__name__}: {n}({comp_inputs})"

            # Create Node with properties parameter
            viz_node = Node(  # type: ignore
                id=next_id,
                caption=label,
                size=20,
                properties={"node_type": "component"},
            )
            nodes.append(viz_node)
            next_id += 1

            # Create nodes for each output field
            for o in pipeline_node.component.component_outputs:
                param_node_name = f"{n}.{o}"

                # Skip if we're hiding unused outputs and it's not used
                if hide_unused_outputs:
                    # Check if this output is used as a source in any parameter mapping
                    is_used = False
                    for params in self.param_mapping.values():
                        for mapping in params.values():
                            source_component = mapping["component"]
                            source_param_name = mapping.get("param")
                            if source_component == n and source_param_name == o:
                                is_used = True
                                break
                        if is_used:
                            break

                    if not is_used:
                        continue

                node_ids[param_node_name] = next_id
                # Create Node with properties parameter
                output_node = Node(  # type: ignore
                    id=next_id,
                    caption=o,
                    size=15,
                    properties={"node_type": "output"},
                )
                nodes.append(output_node)

                # Connect component to its output
                rel = Relationship(  # type: ignore
                    source=node_ids[n],
                    target=node_ids[param_node_name],
                    properties={"type": "HAS_OUTPUT"},
                )
                relationships.append(rel)
                next_id += 1

        # Create edges between components based on parameter mapping
        for component_name, params in self.param_mapping.items():
            for param, mapping in params.items():
                source_component = mapping["component"]
                source_param_name = mapping.get("param")

                if source_param_name:
                    source_output_node = f"{source_component}.{source_param_name}"
                else:
                    source_output_node = source_component

                if source_output_node in node_ids and component_name in node_ids:
                    rel = Relationship(  # type: ignore
                        source=node_ids[source_output_node],
                        target=node_ids[component_name],
                        caption=param,
                        properties={"type": "CONNECTS_TO"},
                    )
                    relationships.append(rel)

        # Create the visualization graph
        viz_graph = VisualizationGraph(nodes=nodes, relationships=relationships)
        return viz_graph

    def get_pygraphviz_graph(self, hide_unused_outputs: bool = True) -> Any:
        """Legacy method for backward compatibility.

        Uses neo4j-viz instead of pygraphviz.
        """
        warnings.warn(
            "get_pygraphviz_graph is deprecated, use draw instead",
            DeprecationWarning,
            stacklevel=2,
        )
        return self._get_neo4j_viz_graph(hide_unused_outputs)
    def add_component(self, component: Component, name: str) -> None:
        """Add a new component. Components are uniquely identified
        by their name. If 'name' is already in the pipeline, a ValueError
        is raised."""
        task = TaskPipelineNode(name, component)
        self.add_node(task)
        # invalidate the pipeline if it was already validated
        self.invalidate()
    def set_component(self, name: str, component: Component) -> None:
        """Replace a component with another. If 'name' is not yet in the pipeline,
        raises ValueError.
        """
        task = TaskPipelineNode(name, component)
        self.set_node(task)
        # invalidate the pipeline if it was already validated
        self.invalidate()
    def connect(
        self,
        start_component_name: str,
        end_component_name: str,
        input_config: Optional[dict[str, str]] = None,
    ) -> None:
        """Connect one component to another.

        Args:
            start_component_name (str): name of the component as defined in
                the add_component method
            end_component_name (str): name of the component as defined in
                the add_component method
            input_config (Optional[dict[str, str]]): end component input configuration:
                propagate previous components' outputs.

        Raises:
            PipelineDefinitionError: if the provided components are not in the Pipeline
                or if the graph that would be created by this connection is cyclic.
        """
        edge = PipelineEdge(
            start_component_name,
            end_component_name,
            data={"input_config": input_config},
        )
        try:
            self.add_edge(edge)
        except KeyError:
            raise PipelineDefinitionError(
                f"{start_component_name} or {end_component_name} is not in the Pipeline"
            )
        if self.is_cyclic():
            raise PipelineDefinitionError("Cyclic graphs are not allowed")
        # invalidate the pipeline if it was already validated
        self.invalidate()
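    # Hedged sketch of connect: input_config values use the
    # "<component_name>.<output_field>" convention parsed by
    # validate_parameter_mapping_for_task below. Component and field names
    # ("splitter", "writer", "reporter", "chunks") are illustrative.
    #
    #     pipeline.connect(
    #         "splitter",
    #         "writer",
    #         input_config={"chunks": "splitter.chunks"},
    #     )
    #     # mapping a whole component result instead of a single output field:
    #     pipeline.connect("writer", "reporter", input_config={"result": "writer"})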
    def invalidate(self) -> None:
        self.is_validated = False
        self.param_mapping = defaultdict(dict)
        self.missing_inputs = defaultdict()

    def validate_parameter_mapping(self) -> None:
        """Go through the graph and make sure parameter mapping is valid
        (without considering user input yet)
        """
        if self.is_validated:
            return
        for task in self._nodes.values():
            self.validate_parameter_mapping_for_task(task)
        self.is_validated = True

    def validate_input_data(self, data: dict[str, Any]) -> bool:
        """Performs parameter and data validation before running the pipeline:
        - Check parameters defined in the connect method
        - Make sure the missing parameters are present in the input `data` dict.

        Args:
            data (dict[str, Any]): input data to use for validation
                (usually from Pipeline.run)

        Raises:
            PipelineDefinitionError if any parameter mapping is invalid or if
                a parameter is missing.
        """
        if not self.is_validated:
            self.validate_parameter_mapping()
        for task in self._nodes.values():
            if task.name not in self.param_mapping:
                self.validate_parameter_mapping_for_task(task)
            missing_params = self.missing_inputs[task.name]
            task_data = data.get(task.name) or {}
            for param in missing_params:
                if param not in task_data:
                    raise PipelineDefinitionError(
                        f"Parameter '{param}' not provided for component '{task.name}'"
                    )
        return True

    def validate_parameter_mapping_for_task(self, task: TaskPipelineNode) -> bool:
        """Make sure that all the parameter mappings for a given task are valid.
        Does not consider user input yet.

        Considering the naming {param => target (component, [output_parameter]) },
        the mapping is valid if:
        - 'param' is a valid input for task
        - 'param' has not already been mapped
        - The target component exists in the pipeline and, if specified, the
            target output parameter is a valid field in the target component's
            result model.

        This method builds the param_mapping and missing_inputs instance variables.
        """
        component = task.component
        expected_mandatory_inputs = [
            param_name
            for param_name, config in component.component_inputs.items()
            if config["has_default"] is False
        ]
        # start building the actual input list, starting
        # from the inputs provided in the pipeline.run method
        actual_inputs = []
        prev_edges = self.previous_edges(task.name)
        # then, iterate over all parents to find the parameter propagation
        for edge in prev_edges:
            edge_data = edge.data or {}
            edge_inputs = edge_data.get("input_config") or {}
            # check that the previous component is actually returning
            # the mapped parameter
            for param, path in edge_inputs.items():
                if param in self.param_mapping[task.name]:
                    raise PipelineDefinitionError(
                        f"Parameter '{param}' already mapped to "
                        f"{self.param_mapping[task.name][param]}"
                    )
                if param not in task.component.component_inputs:
                    raise PipelineDefinitionError(
                        f"Parameter '{param}' is not a valid input for component "
                        f"'{task.name}' of type '{task.component.__class__.__name__}'"
                    )
                try:
                    source_component_name, param_name = path.split(".")
                except ValueError:
                    # no specific output mapped: the full source component result
                    # will be passed to the next component
                    self.param_mapping[task.name][param] = {
                        "component": path,
                    }
                    continue
                try:
                    source_node = self.get_node_by_name(source_component_name)
                except KeyError:
                    raise PipelineDefinitionError(
                        f"Component {source_component_name} does not exist in the "
                        f"pipeline, cannot map {param} to {path} for {task.name}."
                    )
                source_component = source_node.component
                source_component_outputs = source_component.component_outputs
                if param_name and param_name not in source_component_outputs:
                    raise PipelineDefinitionError(
                        f"Parameter {param_name} is not a valid output for "
                        f"{source_component_name} (must be one of "
                        f"{list(source_component_outputs.keys())})"
                    )
                self.param_mapping[task.name][param] = {
                    "component": source_component_name,
                    "param": param_name,
                }
            actual_inputs.extend(list(edge_inputs.keys()))
        missing_inputs = list(set(expected_mandatory_inputs) - set(actual_inputs))
        self.missing_inputs[task.name] = missing_inputs
        return True

    async def get_final_results(self, run_id: str) -> dict[str, Any]:
        return await self.final_results.get(run_id)  # type: ignore[no-any-return]

    async def stream(
        self, data: dict[str, Any], raise_exception: bool = True
    ) -> AsyncGenerator[Event, None]:
        """Run the pipeline and stream events for task progress.

        Args:
            data (dict): Input data for the pipeline components
            raise_exception (bool): set to False to prevent this task from
                propagating Pipeline exceptions.

        Yields:
            Event: Pipeline and task events including start, progress, and completion
        """
        # Create queue for events
        event_queue: asyncio.Queue[Event] = asyncio.Queue()
        run_id = None

        async def event_stream(event: Event) -> None:
            # Put event in queue for streaming
            await event_queue.put(event)

        # Add event streaming callback
        self.callbacks.append(event_stream)

        event_queue_getter_task = None
        try:
            # Start pipeline execution in background task
            run_task = asyncio.create_task(self.run(data))

            # loop until the run task is done, and we do not have
            # any more pending tasks in queue
            is_run_task_running = True
            is_queue_empty = False
            while is_run_task_running or not is_queue_empty:
                # Wait for next event or pipeline completion
                event_queue_getter_task = asyncio.create_task(event_queue.get())
                done, pending = await asyncio.wait(
                    [run_task, event_queue_getter_task],
                    return_when=asyncio.FIRST_COMPLETED,
                )
                is_run_task_running = run_task not in done
                is_queue_empty = event_queue.empty()
                for event_future in done:
                    if event_future == run_task:
                        continue
                    # we are sure to get an Event here, since this is the only
                    # thing we put in the queue, but mypy still complains
                    event = event_future.result()
                    run_id = getattr(event, "run_id", None)
                    yield event  # type: ignore

            if exc := run_task.exception():
                yield PipelineEvent(
                    event_type=EventType.PIPELINE_FAILED,
                    # run_id is null if pipeline fails before even starting
                    # ie during pipeline validation
                    run_id=run_id or "",
                    message=str(exc),
                )
                if raise_exception:
                    raise exc
        finally:
            # Restore original callback
            self.callbacks.remove(event_stream)
            if event_queue_getter_task and not event_queue_getter_task.done():
                event_queue_getter_task.cancel()
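    # Hedged sketch of consuming stream: events arrive as the pipeline
    # progresses. The input-data keys are component names; "splitter" and its
    # "text" parameter are illustrative, and the message attribute is read
    # defensively since not every event type carries one.
    #
    #     async for event in pipeline.stream({"splitter": {"text": "some text"}}):
    #         print(event.event_type, getattr(event, "message", None))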
    async def run(self, data: dict[str, Any]) -> PipelineResult:
        """Run the pipeline with the given input data and return a
        PipelineResult holding the run id and the final component results."""
        logger.debug("PIPELINE START")
        start_time = default_timer()
        self.invalidate()
        self.validate_input_data(data)
        orchestrator = Orchestrator(self)
        logger.debug(f"PIPELINE ORCHESTRATOR: {orchestrator.run_id}")
        await orchestrator.run(data)
        end_time = default_timer()
        logger.debug(
            f"PIPELINE FINISHED {orchestrator.run_id} in {end_time - start_time}s"
        )
        return PipelineResult(
            run_id=orchestrator.run_id,
            result=await self.get_final_results(orchestrator.run_id),
        )
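
# Hedged end-to-end sketch: run returns a PipelineResult whose `result` field
# holds the final (leaf) component outputs gathered by the orchestrator. The
# "splitter" key and its "text" parameter are illustrative.
#
#     import asyncio
#
#     result = asyncio.run(pipeline.run({"splitter": {"text": "some text"}}))
#     print(result.run_id, result.result)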