New chunking classes (#996)

The original chunkers ported from SK had some bugs introduced while
refactoring, leading to incorrect splits. This is a full rewrite
following the original logic, with some changes:
- remove the `MaxTokensPerLine` setting
- overlap doesn't use sentences anymore, and copies raw tokens from the
previous chunk instead
- the markdown chunker uses better splitting logic, although it should be
rewritten to use a markdown parser
- chunkers now work with a Chunk class which is also used by the file
parsers. This will allow porting properties from files to chunks, such
as page numbers and other metadata
- chunkers now take a dependency on tokenizers directly, rather than
just TokenCount
- chunkers are now out of Core and into a dedicated NuGet package, for
future reuse outside KM
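A minimal usage sketch of the new chunker API, assembled from the unit tests included in this commit (the `List<string>` result type and the input file are assumptions for illustration):

```csharp
using System.Collections.Generic;
using System.IO;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;

string markdownText = File.ReadAllText("doc2.md");

// Chunkers now take the tokenizer dependency directly
var chunker = new MarkDownChunker(new CL100KTokenizer());

// At most 600 tokens per chunk; each chunk starts with the last
// 60 raw tokens copied from the previous chunk
List<string> chunks = chunker.Split(markdownText, new MarkDownChunkerOptions
{
    MaxTokensPerChunk = 600,
    Overlap = 60
});
```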
dluc authored Feb 6, 2025
1 parent 2681a9b commit a490102
Showing 51 changed files with 69,613 additions and 1,412 deletions.
3 changes: 2 additions & 1 deletion .github/_typos.toml
@@ -17,7 +17,8 @@ extend-exclude = [
"appsettings.Development.json",
"appsettings.*.json.*",
"AzureAISearchFilteringTest.cs",
"KernelMemory.sln.DotSettings"
"KernelMemory.sln.DotSettings",
"doc1.txt",
]

[default.extend-words]
13 changes: 13 additions & 0 deletions KernelMemory.sln
@@ -404,6 +404,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "images", "images", "{B7CC5E
infra\images\Pip.png = infra\images\Pip.png
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Chunkers", "extensions\Chunkers\Chunkers\Chunkers.csproj", "{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Chunkers.UnitTests", "extensions\Chunkers\Chunkers.UnitTests\Chunkers.UnitTests.csproj", "{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -709,6 +713,13 @@ Global
{41A5A076-B35D-4191-B98C-65AD5782A108}.Debug|Any CPU.Build.0 = Debug|Any CPU
{41A5A076-B35D-4191-B98C-65AD5782A108}.Release|Any CPU.ActiveCfg = Release|Any CPU
{41A5A076-B35D-4191-B98C-65AD5782A108}.Release|Any CPU.Build.0 = Release|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Debug|Any CPU.Build.0 = Debug|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Release|Any CPU.ActiveCfg = Release|Any CPU
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB}.Release|Any CPU.Build.0 = Release|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Debug|Any CPU.Build.0 = Debug|Any CPU
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226}.Release|Any CPU.ActiveCfg = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -826,6 +837,8 @@ Global
{B8858AB4-5CB9-4CD8-A6A0-12847F792FF2} = {C2D3A947-B6F9-4306-BD42-21D8D1F42750}
{237B22CA-B757-43DF-9A0B-18DE7F4DA123} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
{B7CC5E82-AD91-488F-8C05-1ECD767D4A10} = {B488168B-AD86-4CC5-9D89-324B6EB743D9}
{BFF9BE1A-B0E4-4ABE-B384-01B200D4FEFB} = {155DA079-E267-49AF-973A-D1D44681970F}
{FD1EB2C1-581E-4EB8-AF4A-BC4773453226} = {3C17F42B-CFC8-4900-8CFB-88936311E919}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {CC136C62-115C-41D1-B414-F9473EFF6EA8}
2 changes: 2 additions & 0 deletions KernelMemory.sln.DotSettings
@@ -186,6 +186,7 @@
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EMigrateThisQualifierSettings/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EPsi_002ECSharp_002ECodeStyle_002ESettingsUpgrade_002EPredefinedNamingRulesToUserRulesUpgrade/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EUnitTestFramework_002ESettings_002EMigrations_002ERemoveBuildPolicyAlwaysMigration/@EntryIndexedValue">True</s:Boolean>
<s:String x:Key="/Default/Environment/UnitTesting/XunitProvider/TestDiscoveryFromArtifactsMethod/@EntryValue">TestRunner</s:String>
<s:Boolean x:Key="/Default/Housekeeping/Layout/SolBuilderDuoView/ShowBuildProgressInToolWindow/@EntryValue">False</s:Boolean>
<s:String x:Key="/Default/Housekeeping/UnitTestingMru/UnitTestSessionDefault/LogSeverity/@EntryValue">TRACE</s:String>
<s:Int64 x:Key="/Default/Housekeeping/UnitTestingMru/UnitTestSessionDefault/OutputLineNumberLimit/@EntryValue">8201</s:Int64>
@@ -246,6 +247,7 @@ public void It$SOMENAME$()
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREBLOBS/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREIDENTITY/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=AZUREQUEUE/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=chunkers/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=CONNECTIONSTRING/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=daa/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=appsettings/@EntryIndexedValue">True</s:Boolean>
17 changes: 7 additions & 10 deletions docs/how-to/custom-partitioning.md
@@ -19,19 +19,18 @@ which uses settings defined in

The handler performs the following steps:

1. **Split text into lines**: If a line is too long, it stops and starts a new line.
2. **Form paragraphs**: Concatenate consecutive lines together up to a maximum paragraph size.
3. **Overlap**: When starting a new paragraph, retain a certain number of lines from the previous paragraph.
1. **Split text into chunks**
2. **Form paragraphs**: Concatenate consecutive chunks together up to a maximum chunk size.
3. **Overlap**: When starting a new chunk, retain a certain number of tokens from the previous chunk (see the sketch below).
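To make the overlap step concrete, here is a minimal sketch of token-based chunking with raw-token overlap (illustrative only, not the handler's actual implementation): each chunk holds at most `maxTokens` tokens and repeats the last `overlap` tokens of its predecessor.

```csharp
using System;
using System.Collections.Generic;
using System.Linq;

internal static class OverlapSketch
{
    // Requires overlap < maxTokens (see the settings table: Overlap max is chunk length - 1)
    public static List<string> SplitWithOverlap(IReadOnlyList<string> tokens, int maxTokens, int overlap)
    {
        var chunks = new List<string>();
        int step = maxTokens - overlap; // new tokens contributed by each chunk
        for (int start = 0; start < tokens.Count; start += step)
        {
            int length = Math.Min(maxTokens, tokens.Count - start);
            chunks.Add(string.Concat(tokens.Skip(start).Take(length)));
            if (start + length >= tokens.Count) break; // last chunk covers the tail
        }
        return chunks;
    }
}
```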

## Default Settings

The default values used by `TextPartitioningHandler` are:

| Setting | Value | Min | Max |
|------------------|-----------------|-----|------------------------|
| Paragraph length | 1000 tokens max | 1 | depends on the LLM |
| Line length | 300 tokens max | 1 | [paragraph length] |
| Overlap | 100 tokens | 0 | [paragraph length - 1] |
| Setting | Value | Min | Max |
|--------------|-----------------|-----|--------------------|
| Chunk length | 1000 tokens max | 1 | depends on the LLM |
| Overlap | 100 tokens | 0 | [chunk length - 1] |

Lengths are expressed in tokens, which depend on the large language model (LLM) in use and its
tokenization logic. KernelMemoryBuilder allows specifying a custom tokenizer for each LLM during setup.
@@ -59,7 +58,6 @@ For example, with small models supporting up to 256 tokens, something like this
...
"TextPartitioning": {
"MaxTokensPerParagraph": 256,
"MaxTokensPerLine": 256,
"OverlappingTokens": 50
},
...
@@ -74,7 +72,6 @@
new TextPartitioningOptions
{
MaxTokensPerParagraph = 256,
MaxTokensPerLine = 256,
OverlappingTokens = 50
})
.Build<MemoryServerless>();
6 changes: 2 additions & 4 deletions examples/102-dotnet-custom-partitioning-options/Program.cs
@@ -7,11 +7,9 @@
.WithOpenAIDefaults(Environment.GetEnvironmentVariable("OPENAI_API_KEY")!)
.WithCustomTextPartitioningOptions(new TextPartitioningOptions
{
// Max 99 tokens per sentence
MaxTokensPerLine = 99,
// When sentences are merged into paragraphs (aka partitions), stop at 299 tokens
// When splitting text into chunks (aka partitions), stop at 299 tokens
MaxTokensPerParagraph = 299,
// Each paragraph contains the last 47 tokens from the previous one
// Each chunk contains the last 47 tokens from the previous one
OverlappingTokens = 47,
})
.Build<MemoryServerless>();
2 changes: 1 addition & 1 deletion examples/108-dotnet-custom-content-decoders/Program.cs
@@ -91,7 +91,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
foreach (Page? page in pdfDocument.GetPages().Where(x => x != null))
{
string pageContent = (ContentOrderTextExtractor.GetText(page, options) ?? string.Empty).ReplaceLineEndings(" ");
result.Sections.Add(new FileSection(page.Number, pageContent, false));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
}

return Task.FromResult(result);
10 changes: 5 additions & 5 deletions examples/205-dotnet-extract-text-from-docs/Program.cs
@@ -16,7 +16,7 @@
var msWordDecoder = new MsWordDecoder();
content = await msWordDecoder.DecodeAsync("mswordfile.docx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -36,7 +36,7 @@
var msPowerPointDecoder = new MsPowerPointDecoder();
content = await msPowerPointDecoder.DecodeAsync("mspowerpointfile.pptx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Slide: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -56,7 +56,7 @@
var msExcelDecoder = new MsExcelDecoder();
content = await msExcelDecoder.DecodeAsync("msexcelfile.xlsx");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Worksheet: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -76,7 +76,7 @@
var pdfDecoder = new PdfDecoder();
content = await pdfDecoder.DecodeAsync("file1.pdf");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
@@ -95,7 +95,7 @@

content = await pdfDecoder.DecodeAsync("file2.pdf");

foreach (FileSection section in content.Sections)
foreach (Chunk section in content.Sections)
{
Console.WriteLine($"Page: {section.Number}/{content.Sections.Count}");
Console.WriteLine(section.Content);
5 changes: 2 additions & 3 deletions examples/207-dotnet-expanding-chunks-on-retrieval/Program.cs
@@ -40,7 +40,7 @@ public static class Program
public static async Task Main()
{
// Partition input text in chunks of 100 tokens
const int PartitionSize = 100;
const int Chunksize = 100;

// Search settings
const string Query = "astrobiology";
@@ -59,8 +59,7 @@ public static async Task Main()
// Customize memory records size (in tokens)
var textPartitioningOptions = new TextPartitioningOptions
{
MaxTokensPerParagraph = PartitionSize,
MaxTokensPerLine = PartitionSize,
MaxTokensPerParagraph = Chunksize,
OverlappingTokens = 0,
};

11 changes: 4 additions & 7 deletions examples/210-KM-without-builder/appsettings.json
@@ -154,14 +154,11 @@
"ImageOcrType": "None",
// Partitioning / Chunking settings
// How does the partitioning work?
// * Given a document, text is extracted, and text is split in sentences, called "lines of text".
// * Sentences are merged into paragraphs, called "partitions".
// * For each partition, one (potentially more) memory is generated.
// * Given a document, text is extracted, and the text is split into tokens.
// * Tokens are merged into chunks, called "partitions", sometimes also called "paragraphs".
// * For each chunk, one (potentially more) memory is generated.
"TextPartitioning": {
// Maximum length of lines of text (aka sentences), in tokens. Tokens depend on the LLM in use.
// Sentences are grouped into paragraphs, see the next setting.
"MaxTokensPerLine": 300,
// Maximum length of paragraphs (aka partitions), in tokens. Tokens depend on the LLM in use.
// Maximum length of chunks in tokens. Tokens depend on the LLM in use.
"MaxTokensPerParagraph": 1000,
// How many tokens from a paragraph to keep in the following paragraph.
"OverlappingTokens": 100
44 changes: 44 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Chunkers.UnitTests.csproj
@@ -0,0 +1,44 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<AssemblyName>Microsoft.Chunkers.UnitTests</AssemblyName>
<RootNamespace>Microsoft.Chunkers.UnitTests</RootNamespace>
<TargetFramework>net8.0</TargetFramework>
<RollForward>LatestMajor</RollForward>
<IsTestProject>true</IsTestProject>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
<NoWarn>xUnit2013;CA1303;KMEXP00;</NoWarn>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection" />
<PackageReference Include="Microsoft.NET.Test.Sdk" />
<PackageReference Include="Xunit.DependencyInjection" />
<PackageReference Include="Xunit.DependencyInjection.Logging" />
<PackageReference Include="xunit" />
<PackageReference Include="xunit.abstractions" />
<PackageReference Include="xunit.runner.visualstudio">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\service\tests\TestHelpers\TestHelpers.csproj" />
<ProjectReference Include="..\Chunkers\Chunkers.csproj" />
</ItemGroup>

<ItemGroup>
<None Remove="doc1.txt" />
<Content Include="doc1.txt">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
<None Remove="doc2.md" />
<Content Include="doc2.md">
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
</Content>
</ItemGroup>

</Project>
32 changes: 32 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/FourCharsTestTokenizer.cs
@@ -0,0 +1,32 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

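// Test tokenizer: every 4 characters count as one token.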
internal sealed class FourCharsTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return (int)Math.Ceiling(text.Length / 4d);
}

public IReadOnlyList<string> GetTokens(string text)
{
var tokens = new List<string>((text.Length + 3) / 4);

Span<char> buffer = stackalloc char[4];
for (int i = 0; i < text.Length; i += 4)
{
int tokenLength = Math.Min(4, text.Length - i);
for (int j = 0; j < tokenLength; j++)
{
buffer[j] = text[i + j];
}

tokens.Add(new string(buffer.Slice(0, tokenLength)));
}

return tokens;
}
}
20 changes: 20 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/OneCharTestTokenizer.cs
@@ -0,0 +1,20 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

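// Test tokenizer: every character is one token.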
internal sealed class OneCharTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return text.Length;
}

public IReadOnlyList<string> GetTokens(string text)
{
var tokens = new List<string>(text.Length);
tokens.AddRange(text.Select(t => t.ToString()));
return tokens;
}
}
36 changes: 36 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/Helpers/TwoCharsTestTokenizer.cs
@@ -0,0 +1,36 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.KernelMemory.AI;

namespace Microsoft.Chunkers.UnitTests.Helpers;

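// Test tokenizer: every 2 characters count as one token.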
internal sealed class TwoCharsTestTokenizer : ITextTokenizer
{
public int CountTokens(string text)
{
return (int)Math.Ceiling(text.Length / 2d);
}

public IReadOnlyList<string> GetTokens(string text)
{
int length = text.Length;
var tokens = new List<string>(length / 2 + length % 2);

Span<char> buffer = stackalloc char[2];
for (int i = 0; i < length; i += 2)
{
buffer[0] = text[i];
if (i + 1 < length)
{
buffer[1] = text[i + 1];
tokens.Add(new string(buffer));
}
else
{
tokens.Add(text[i].ToString());
}
}

return tokens;
}
}
50 changes: 50 additions & 0 deletions extensions/Chunkers/Chunkers.UnitTests/MarkDownChunkerManualTest.cs
@@ -0,0 +1,50 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Diagnostics;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KM.TestHelpers;
using Xunit;
using Xunit.Abstractions;

namespace Microsoft.Chunkers.UnitTests;

public class MarkDownChunkerManualTest(ITestOutputHelper output) : BaseUnitTestCase(output)
{
[Fact]
[Trait("Category", "UnitTest")]
[Trait("Category", "Chunking")]
[Trait("Category", "Manual")]
public void ItSplitsMarkdownInASensibleWay()
{
// Arrange
string text = File.ReadAllText("doc2.md");
text = $"{text}{text}";

// Act
var w = new Stopwatch();
w.Start();
var chunks = new MarkDownChunker(new CL100KTokenizer()).Split(text, new MarkDownChunkerOptions { MaxTokensPerChunk = 600, Overlap = 60 });
w.Stop();

Console.WriteLine($"Text length: {text.Length:N0} chars");
Console.WriteLine($"Chunks: {chunks.Count}");
Console.WriteLine($"Time: {w.ElapsedMilliseconds:N0} ms");

// Assert
Assert.NotEmpty(chunks);
DebugChunks(chunks, new CL100KTokenizer());
}

private static void DebugChunks(IEnumerable<string> chunks, ITextTokenizer tokenizer)
{
var list = chunks.ToList();

for (int index = 0; index < list.Count; index++)
{
Console.WriteLine($"************************* {index}: [{tokenizer.CountTokens(list[index])} tokens] *****************************************");
Console.WriteLine(list[index]);
Console.WriteLine("***********************************************************************************");
}
}
}