Source code for neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter

#  Copyright (c) "Neo4j"
#  Neo4j Sweden AB [https://neo4j.com]
#  #
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  #
#      https://www.apache.org/licenses/LICENSE-2.0
#  #
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
from pydantic import validate_call

from neo4j_graphrag.experimental.components.text_splitters.base import TextSplitter
from neo4j_graphrag.experimental.components.types import TextChunk, TextChunks


def _adjust_chunk_start(text: str, approximate_start: int) -> int:
    """
    Shift the starting index backward if it lands in the middle of a word.
    If no whitespace is found, use the proposed start.

     Args:
        text (str): The text being split.
        approximate_start (int): The initial starting index of the chunk.

    Returns:
        int: The adjusted starting index, ensuring the chunk does not begin in the
             middle of a word if possible.
    """
    start = approximate_start
    if start > 0 and not text[start].isspace() and not text[start - 1].isspace():
        while start > 0 and not text[start - 1].isspace():
            start -= 1

        # fallback if no whitespace is found
        if start == 0 and not text[0].isspace():
            start = approximate_start
    return start


def _adjust_chunk_end(text: str, start: int, approximate_end: int) -> int:
    """
    Shift the ending index backward if it lands in the middle of a word.
    If no whitespace is found, use 'approximate_end'.

    Args:
        text (str): The full text being split.
        start (int): The adjusted starting index for this chunk.
        approximate_end (int): The initial end index.

    Returns:
        int: The adjusted ending index, ensuring the chunk does not end in the middle of
            a word if possible.
    """
    end = approximate_end
    if end < len(text):
        while end > start and not text[end].isspace() and not text[end - 1].isspace():
            end -= 1

        # fallback if no whitespace is found
        if end == start:
            end = approximate_end
    return end


class FixedSizeSplitter(TextSplitter):
    """Text splitter which splits the input text into fixed or approximate fixed size
    chunks with optional overlap.

    Args:
        chunk_size (int): The number of characters in each chunk.
        chunk_overlap (int): The number of characters from the previous chunk to overlap
            with each chunk. Must be less than `chunk_size`.
        approximate (bool): If True, avoids splitting words in the middle at chunk
            boundaries. Defaults to True.

    Raises:
        ValueError: If `chunk_size` is not strictly positive, or if `chunk_overlap`
            is not strictly less than `chunk_size`.

    Example:

    .. code-block:: python

        from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
        from neo4j_graphrag.experimental.pipeline import Pipeline

        pipeline = Pipeline()
        text_splitter = FixedSizeSplitter(chunk_size=4000, chunk_overlap=200, approximate=True)
        pipeline.add_component(text_splitter, "text_splitter")
    """

    @validate_call
    def __init__(
        self, chunk_size: int = 4000, chunk_overlap: int = 200, approximate: bool = True
    ) -> None:
        if chunk_size <= 0:
            raise ValueError("chunk_size must be strictly greater than 0")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be strictly less than chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.approximate = approximate

    @validate_call
    async def run(self, text: str) -> TextChunks:
        """Splits a piece of text into chunks.

        Args:
            text (str): The text to be split.

        Returns:
            TextChunks: A list of chunks.
        """
        chunks = []
        # distance between the proposed starts of two consecutive chunks;
        # strictly positive because chunk_overlap < chunk_size
        step = self.chunk_size - self.chunk_overlap
        text_length = len(text)
        approximate_start = 0
        # start of the most recently emitted chunk (-1 before the first chunk)
        previous_start = -1
        skip_adjust_chunk_start = False
        end = 0

        while end < text_length:
            if self.approximate:
                start = (
                    approximate_start
                    if skip_adjust_chunk_start
                    else _adjust_chunk_start(text, approximate_start)
                )
                # Guarantee forward progress: when `step` is shorter than the word
                # the proposed start lands in, snapping back to the beginning of
                # that word yields the previous chunk start (or an earlier one)
                # and the same chunk would be produced forever. In that case keep
                # the unadjusted start, which is always past the previous one.
                if start <= previous_start:
                    start = approximate_start
                # adjust start and end to avoid cutting words in the middle
                approximate_end = min(start + self.chunk_size, text_length)
                end = _adjust_chunk_end(text, start, approximate_end)
                # when avoiding splitting words in the middle is not possible, revert to
                # initial chunk end and skip adjusting next chunk start
                skip_adjust_chunk_start = end == approximate_end
            else:
                # apply fixed size splitting with possibly words cut in half at chunk
                # boundaries
                start = approximate_start
                end = min(start + self.chunk_size, text_length)

            # chunk indices are consecutive, starting at 0
            chunks.append(TextChunk(text=text[start:end], index=len(chunks)))

            previous_start = start
            approximate_start = start + step

        return TextChunks(chunks=chunks)