# Source code for neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter

#  Copyright (c) "Neo4j"
#  Neo4j Sweden AB [https://neo4j.com]
#  #
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  #
#      https://www.apache.org/licenses/LICENSE-2.0
#  #
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from pydantic import validate_call

from neo4j_graphrag.experimental.components.text_splitters.base import TextSplitter
from neo4j_graphrag.experimental.components.types import TextChunk, TextChunks


def _adjust_chunk_start(text: str, approximate_start: int) -> int:
    """
    Shift the starting index backward if it lands in the middle of a word.
    If no whitespace is found, use the proposed start.

     Args:
        text (str): The text being split.
        approximate_start (int): The initial starting index of the chunk.

    Returns:
        int: The adjusted starting index, ensuring the chunk does not begin in the
             middle of a word if possible.
    """
    start = approximate_start
    if start > 0 and not text[start].isspace() and not text[start - 1].isspace():
        while start > 0 and not text[start - 1].isspace():
            start -= 1

        # fallback if no whitespace is found
        if start == 0 and not text[0].isspace():
            start = approximate_start
    return start


def _adjust_chunk_end(text: str, start: int, approximate_end: int) -> int:
    """
    Shift the ending index backward if it lands in the middle of a word.
    If no whitespace is found, use 'approximate_end'.

    Args:
        text (str): The full text being split.
        start (int): The adjusted starting index for this chunk.
        approximate_end (int): The initial end index.

    Returns:
        int: The adjusted ending index, ensuring the chunk does not end in the middle of
            a word if possible.
    """
    end = approximate_end
    if end < len(text):
        while end > start and not text[end].isspace() and not text[end - 1].isspace():
            end -= 1

        # fallback if no whitespace is found
        if end == start:
            end = approximate_end
    return end



[docs]
class FixedSizeSplitter(TextSplitter):
    """Text splitter which splits the input text into fixed or approximate fixed size
       chunks with optional overlap.

    Args:
        chunk_size (int): The number of characters in each chunk.
        chunk_overlap (int): The number of characters from the previous chunk to overlap
                            with each chunk. Must be less than `chunk_size`.
        approximate (bool): If True, avoids splitting words in the middle at chunk
                            boundaries. Defaults to True.


    Example:

    .. code-block:: python

        from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
        from neo4j_graphrag.experimental.pipeline import Pipeline

        pipeline = Pipeline()
        text_splitter = FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=True)
        pipeline.add_component(text_splitter, "text_splitter")
    """

    @validate_call
    def __init__(
        self, chunk_size: int = 4000, chunk_overlap: int = 200, approximate: bool = True
    ) -> None:
        """Store the splitting parameters after validating them.

        Raises:
            ValueError: If `chunk_size` is not strictly positive, or if
                `chunk_overlap` is not strictly less than `chunk_size`.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be strictly greater than 0")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be strictly less than chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.approximate = approximate

    @validate_call
    async def run(self, text: str) -> TextChunks:
        """Splits a piece of text into chunks.

        Consecutive chunks start `chunk_size - chunk_overlap` characters apart.
        When `approximate` is True, chunk boundaries are moved backward to the
        nearest whitespace boundary where possible, but a chunk never starts at
        or before the start of the previous chunk.

        Args:
            text (str): The text to be split.

        Returns:
            TextChunks: A list of chunks.
        """
        chunks = []
        # distance between the (unadjusted) starts of two consecutive chunks;
        # always >= 1 because chunk_overlap < chunk_size
        step = self.chunk_size - self.chunk_overlap
        text_length = len(text)
        approximate_start = 0
        # start index of the previously emitted chunk (-1 before the first one)
        previous_start = -1
        skip_adjust_chunk_start = False
        end = 0

        while end < text_length:
            if self.approximate:
                start = (
                    approximate_start
                    if skip_adjust_chunk_start
                    else _adjust_chunk_start(text, approximate_start)
                )
                # Guarantee forward progress: if the word containing the proposed
                # start begins at or before the previous chunk start, snapping to
                # it would re-emit the same chunks forever. Keep the unadjusted
                # start in that case (it is previous_start + step > previous_start).
                if start <= previous_start:
                    start = approximate_start
                # adjust start and end to avoid cutting words in the middle
                approximate_end = min(start + self.chunk_size, text_length)
                end = _adjust_chunk_end(text, start, approximate_end)
                # when avoiding splitting words in the middle is not possible, revert to
                # initial chunk end and skip adjusting next chunk start
                skip_adjust_chunk_start = end == approximate_end
            else:
                # apply fixed size splitting with possibly words cut in half at chunk
                # boundaries
                start = approximate_start
                end = min(start + self.chunk_size, text_length)

            # chunk indices are assigned sequentially, starting from 0
            chunks.append(TextChunk(text=text[start:end], index=len(chunks)))

            previous_start = start
            approximate_start = start + step

        return TextChunks(chunks=chunks)