New chunking classes (#996)

The original chunkers ported from SK had some bugs introduced while
refactoring, leading to incorrect splits. This is a full rewrite
following the original logic, with some changes:
- remove the `MaxTokensPerLine` setting
- overlap doesn't use sentences anymore, and copies raw tokens from the
previous chunk instead
- the markdown chunker uses better splitting logic, although it should be
rewritten to use a markdown parser
- chunkers now work with a Chunk class which is also used by the file
parsers. This will allow porting properties from files to chunks, such
as page numbers and other metadata
- chunkers now take a dependency on tokenizers directly, rather than
just TokenCount
- chunkers are now out of Core and into a dedicated NuGet package, for
future reuse outside KM
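A minimal usage sketch of the new chunker API, assembled from the unit tests included in this commit (the `List<string>` result type and the input file are assumptions for illustration):

```csharp
using System.Collections.Generic;
using System.IO;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;

string markdownText = File.ReadAllText("doc2.md");

// Chunkers now take the tokenizer dependency directly
var chunker = new MarkDownChunker(new CL100KTokenizer());

// At most 600 tokens per chunk; each chunk starts with the last
// 60 raw tokens copied from the previous chunk
List<string> chunks = chunker.Split(markdownText, new MarkDownChunkerOptions
{
    MaxTokensPerChunk = 600,
    Overlap = 60
});
```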
dluc authored Feb 6, 2025
1 parent 2681a9b commit a490102
Showing 51 changed files with 69,613 additions and 1,412 deletions.
3 changes: 2 additions & 1 deletion .github/_typos.toml
@@ -17,7 +17,8 @@ extend-exclude = [
"appsettings.Development.json",
"appsettings.*.json.*",
"AzureAISearchFilteringTest.cs",
"KernelMemory.sln.DotSettings"
"KernelMemory.sln.DotSettings",
"doc1.txt",
]

[default.extend-words]
13 changes: 13 additions & 0 deletions KernelMemory.sln
@@ -404,6 +404,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "images", "images", "{B7CC5E
infra\images\Pip.png = infra\images\Pip.png
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Chunkers", "extensions\Chunkers\Chunkers\Chunkers.csproj", "{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Chunkers.UnitTests", "extensions\Chunkers\Chunkers.UnitTests\Chunkers.UnitTests.csproj", "{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -709,6 +713,13 @@ Global
{41A5A076-B35D-4191-B98C-65AD5782A108}.Debug|Any CPU.Build.0 = Debug|Any CPU
{41A5A076-B35D-4191-B98C-65AD5782A108}.Release|Any CPU.ActiveCfg = Release|Any CPU
{41A5A076-B35D-4191-B98C-65AD5782A108}.Release|Any CPU.Build.0 = Release|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Release|Any CPU.Build.0 = Release|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -826,6 +837,8 @@ Global
{B8858AB4-5CB9-4CD8-A6A0-12847F792FF2} = {C2D3A947-B6F9-4306-BD42-21D8D1F42750}
{237B22CA-B757-43DF-9A0B-18DE7F4DA123} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
{B7CC5E82-AD91-488F-8C05-1ECD767D4A10} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB} = {155DA079-E267-49AF-973A-D1D44681970F}
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226} = {3C17F42B-CFC8-4900-8CFB-88936311E919}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {CC136C62-115C-41D1-B414-F9473EFF6EA8}
2 changes: 2 additions & 0 deletions KernelMemory.sln.DotSettings
@@ -186,6 +186,7 @@
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EPredefinedNamingRulesToUserRulesUpgrade/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EUnitTestFramework_002ESettings_002EMigrations_002ERemoveBuildPolicyAlwaysMigration/@EntryIndexedValue">True</s:Boolean>
<s:String x:Key="/Default/Environment/UnitTesting/XunitProvider/TestDiscoveryFromArtifactsMethod/@EntryValue">TestRunner</s:String>
<s:Boolean x:Key="/Default/Housekeeping/Layout/SolBuilderDuoView/ShowBuildProgressInToolWindow/@EntryValue">False</s:Boolean>
<s:String x:Key="/Default/Housekeeping/UnitTestingMru/UnitTestSessionDefault/LogSeverity/@EntryValue">TRACE</s:String>
<s:Int64 x:Key="/Default/Housekeeping/UnitTestingMru/UnitTestSessionDefault/OutputLineNumberLimit/@EntryValue">8201</s:Int64>
@@ -246,6 +247,7 @@ public void It$SOMENAME$()
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREBLOBS/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREIDENTITY/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREQUEUE/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=chunkers/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=CONNECTIONSTRING/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=daa/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=appsettings/@EntryIndexedValue">True</s:Boolean>
17 changes: 7 additions & 10 deletions docs/how-to/custom-partitioning.md
@@ -19,19 +19,18 @@ which uses settings defined in

The handler performs the following steps:

1. **Split text into lines**: If a line is too long, it stops and starts a new line.
2. **Form paragraphs**: Concatenate consecutive lines together up to a maximum paragraph size.
3. **Overlap**: When starting a new paragraph, retain a certain number of lines from the previous paragraph.
1. **Split text into chunks**
2. **Form paragraphs**: Concatenate consecutive chunks together up to a maximum chunk size.
3. **Overlap**: When starting a new chunk, retain a certain number of tokens from the previous chunk (see the sketch below).
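To make the overlap step concrete, here is a minimal sketch of token-based chunking with raw-token overlap (illustrative only, not the handler's actual implementation): each chunk holds at most `maxTokens` tokens and repeats the last `overlap` tokens of its predecessor.

```csharp
using System;
using System.Collections.Generic;
using System.Linq;

internal static class OverlapSketch
{
    // Requires overlap < maxTokens (see the settings table: Overlap max is chunk length - 1)
    public static List<string> SplitWithOverlap(IReadOnlyList<string> tokens, int maxTokens, int overlap)
    {
        var chunks = new List<string>();
        int step = maxTokens - overlap; // new tokens contributed by each chunk
        for (int start = 0; start < tokens.Count; start += step)
        {
            int length = Math.Min(maxTokens, tokens.Count - start);
            chunks.Add(string.Concat(tokens.Skip(start).Take(length)));
            if (start + length >= tokens.Count) break; // last chunk covers the tail
        }
        return chunks;
    }
}
```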

## Default Settings

The default values used by `TextPartitioningHandler` are:

| Setting | Value | Min | Max |
|------------------|-----------------|-----|------------------------|
| Paragraph length | 1000 tokens max | 1 | depends on the LLM |
| Line length | 300 tokens max | 1 | [paragraph length] |
| Overlap | 100 tokens | 0 | [paragraph length - 1] |
| Setting | Value | Min | Max |
|--------------|-----------------|-----|--------------------|
| Chunk length | 1000 tokens max | 1 | depends on the LLM |
| Overlap | 100 tokens | 0 | [chunk length - 1] |

Lengths are expressed in tokens, which depend on the large language model (LLM) in use and its
tokenization logic. KernelMemoryBuilder allows specifying a custom tokenizer for each LLM during setup.
@@ -59,7 +58,6 @@ For example, with small models supporting up to 256 tokens, something like this
...
"TextPartitioning": {
"MaxTokensPerParagraph": 256,
"MaxTokensPerLine": 256,
"OverlappingTokens": 50
},
...
@@ -74,7 +72,6 @@
new TextPartitioningOptions
{
MaxTokensPerParagraph = 256,
MaxTokensPerLine = 256,
OverlappingTokens = 50
})
.Build<MemoryServerless>();
6 changes: 2 additions & 4 deletions examples/102-dotnet-custom-partitioning-options/Program.cs
@@ -7,11 +7,9 @@
.WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
.WithCustomTextPartitioningOptions(new TextPartitioningOptions
{
// Max 99 tokens per sentence
MaxTokensPerLine = 99,
// When sentences are merged into paragraphs (aka partitions), stop at 299 tokens
// When splitting text into chunks (aka partitions), stop at 299 tokens
MaxTokensPerParagraph = 299,
// Each paragraph contains the last 47 tokens from the previous one
// Each chunk contains the last 47 tokens from the previous one
OverlappingTokens = 47,
})
.Build<MemoryServerless>();
2 changes: 1 addition & 1 deletion examples/108-dotnet-custom-content-decoders/Program.cs
@@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
{
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
result.Sections.Add(new FileSection(page.Number, pageContent, false));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
}

return Task.FromResult(result);
10 changes: 5 additions & 5 deletions examples/205-dotnet-extract-text-from-docs/Program.cs
@@ -16,7 +16,7 @@
var msWordDecoder = new MsWordDecoder();
content = await msWordDecoder.DecodeAsync("mswordfile.docx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -36,7 +36,7 @@
var msPowerPointDecoder = new MsPowerPointDecoder();
content = await msPowerPointDecoder.DecodeAsync("mspowerpointfile.pptx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Slide: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -56,7 +56,7 @@
var msExcelDecoder = new MsExcelDecoder();
content = await msExcelDecoder.DecodeAsync("msexcelfile.xlsx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Worksheet: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -76,7 +76,7 @@
var pdfDecoder = new PdfDecoder();
content = await pdfDecoder.DecodeAsync("file1.pdf");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -95,7 +95,7 @@

content = await pdfDecoder.DecodeAsync("file2.pdf");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
5 changes: 2 additions & 3 deletions examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
@@ -40,7 +40,7 @@ public static class Program
public static async Task Main()
{
// Partition input text in chunks of 100 tokens
const int PartitionSize = 100;
const int Chunksize = 100;

// Search settings
const string Query = "astrobiology";
@@ -59,8 +59,7 @@ public static async Task Main()
// Customize memory records size (in tokens)
var textPartitioningOptions = new TextPartitioningOptions
{
MaxTokensPerParagraph = PartitionSize,
MaxTokensPerLine = PartitionSize,
MaxTokensPerParagraph = Chunksize,
OverlappingTokens = 0,
};

11 changes: 4 additions & 7 deletions examples/210-KM-without-builder/appsettings.json
@@ -154,14 +154,11 @@
"ImageOcrType": "None",
// Partitioning / Chunking settings
// How does the partitioning work?
// * Given a document, text is extracted, and text is split in sentences, called "lines of text".
// * Sentences are merged into paragraphs, called "partitions".
// * For each partition, one (potentially more) memory is generated.
// * Given a document, text is extracted, and the text is split into tokens.
// * Tokens are merged into chunks, called "partitions", sometimes also called "paragraphs".
// * For each chunk, one (potentially more) memory is generated.
"TextPartitioning": {
// Maximum length of lines of text (aka sentences), in tokens. Tokens depend on the LLM in use.
// Sentences are grouped into paragraphs, see the next setting.
"MaxTokensPerLine": 300,
// Maximum length of paragraphs (aka partitions), in tokens. Tokens depend on the LLM in use.
// Maximum length of chunks in tokens. Tokens depend on the LLM in use.
"MaxTokensPerParagraph": 1000,
// How many tokens from a paragraph to keep in the following paragraph.
"OverlappingTokens": 100
44 changes: 44 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj
@@ -0,0 +1,44 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<AssemblyName>Microsoft.Chunkers.UnitTests</AssemblyName>
<RootNamespace>Microsoft.Chunkers.UnitTests</RootNamespace>
<TargetFramework>net8.0</TargetFramework>
<RollForward>LatestMajor</RollForward>
<IsTestProject>true</IsTestProject>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
<NoWarn>xUnit2013;CA1303;KMEXP00;</NoWarn>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection" />
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="Xunit.DependencyInjection" />
<PackageReference Include="Xunit.DependencyInjection.Logging" />
<PackageReference Include="xunit" />
<PackageReference Include="xunit.abstractions" />
<PackageReference Include="xunit.runner.visualstudio">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\service\tests\TestHelpers\TestHelpers.csproj" />
<ProjectReference Include="..\Chunkers\Chunkers.csproj" />
</ItemGroup>

<ItemGroup>
<None Remove="doc1.txt" />
<Content Include="doc1.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="doc2.md" />
<Content Include="doc2.md">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
32 changes: 32 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/FourCharsTestTokenizer.cs
@@ -0,0 +1,32 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

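// Test tokenizer: every 4 characters count as one token.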
internal sealed class FourCharsTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return (int)Math.Ceiling(text.Length / 4d);
}

public IReadOnlyList<string> GetTokens(string text)
{
var tokens = new List<string>((text.Length + 3) / 4);

Span<char> buffer = stackalloc char[4];
for (int i = 0; i < text.Length; i += 4)
{
int tokenLength = Math.Min(4, text.Length - i);
for (int j = 0; j < tokenLength; j++)
{
buffer[j] = text[i + j];
}

tokens.Add(new string(buffer.Slice(0, tokenLength)));
}

return tokens;
}
}
20 changes: 20 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/OneCharTestTokenizer.cs
@@ -0,0 +1,20 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

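// Test tokenizer: every character is one token.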
internal sealed class OneCharTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return text.Length;
}

public IReadOnlyList<string> GetTokens(string text)
{
var tokens = new List<string>(text.Length);
tokens.AddRange(text.Select(t => t.ToString()));
return tokens;
}
}
36 changes: 36 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/TwoCharsTestTokenizer.cs
@@ -0,0 +1,36 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

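// Test tokenizer: every 2 characters count as one token.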
internal sealed class TwoCharsTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return (int)Math.Ceiling(text.Length / 2d);
}

public IReadOnlyList<string> GetTokens(string text)
{
int length = text.Length;
var tokens = new List<string>(length / 2 + length % 2);

Span<char> buffer = stackalloc char[2];
for (int i = 0; i < length; i += 2)
{
buffer[0] = text[i];
if (i + 1 < length)
{
buffer[1] = text[i + 1];
tokens.Add(new string(buffer));
}
else
{
tokens.Add(text[i].ToString());
}
}

return tokens;
}
}
50 changes: 50 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerManualTest.cs
@@ -0,0 +1,50 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Diagnostics;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Chunkers.UnitTests;

public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
[Fact]
[Trait("Category", "UnitTest")]
[Trait("Category", "Chunking")]
[Trait("Category", "Manual")]
public void ItSplitsMarkdownInASensibleWay()
{
// Arrange
string text = File.ReadAllText("doc2.md");
text = $"{text}{text}";

// Act
var w = new Stopwatch();
w.Start();
var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
w.Stop();

Console.WriteLine($"Text length: {text.Length:N0} chars");
Console.WriteLine($"Chunks: {chunks.Count}");
Console.WriteLine($"Time: {w.ElapsedMilliseconds:N0} ms");

// Assert
Assert.NotEmpty(chunks);
DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
{
var list = chunks.ToList();

for (int index = 0; index < list.Count; index++)
{
Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(list[index])} tokens] *****************************************");
Console.WriteLine(list[index]);
Console.WriteLine("***********************************************************************************");
}
}
}