Skip to content

Commit

Permalink
feat: 优化词法分析器匹配性能。
Browse files Browse the repository at this point in the history
  • Loading branch information
CYJB committed Mar 25, 2024
1 parent f08ce93 commit 938c1e8
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 49 deletions.
7 changes: 4 additions & 3 deletions Runtime/Lexers/Core/BasicCore`1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ public override bool NextToken(int state, int start)
{
// 最后一次匹配的符号和文本索引。
int lastAccept = -1, lastIndex = source.Index;
int[] states = data.States;
int symbolStart = 0, symbolEnd = 0;
while (true)
{
state = NextState(state);
Expand All @@ -34,10 +36,9 @@ public override bool NextToken(int state, int start)
// 没有合适的转移,退出。
break;
}
ArraySegment<int> symbols = data.GetSymbols(state);
if (symbols.Count > 0)
if (data.GetSymbols(state, ref symbolStart, ref symbolEnd))
{
lastAccept = symbols[0];
lastAccept = states[symbolStart];
lastIndex = source.Index;
// 使用最短匹配时,可以直接返回。
if (data.UseShortest && data.Terminals[lastAccept].UseShortest)
Expand Down
7 changes: 4 additions & 3 deletions Runtime/Lexers/Core/FixedTrailingCore`1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ public override bool NextToken(int state, int start)
{
// 最后一次匹配的符号和文本索引。
int lastAccept = -1, lastIndex = source.Index;
int[] states = data.States;
int symbolStart = 0, symbolEnd = 0;
while (true)
{
state = NextState(state);
Expand All @@ -34,11 +36,10 @@ public override bool NextToken(int state, int start)
// 没有合适的转移,退出。
break;
}
ArraySegment<int> symbols = data.GetSymbols(state);
// 确定不是向前看的头状态。
if (symbols.Count > 0 && symbols[0] >= 0)
if (data.GetSymbols(state, ref symbolStart, ref symbolEnd) && states[symbolStart] >= 0)
{
lastAccept = symbols[0];
lastAccept = states[symbolStart];
lastIndex = source.Index;
// 使用最短匹配时,可以直接返回。
if (data.UseShortest && data.Terminals[lastAccept].UseShortest)
Expand Down
10 changes: 6 additions & 4 deletions Runtime/Lexers/Core/LexerCore`1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,14 @@ protected int NextState(int state)
/// <summary>
/// 返回指定符号列表中的候选类型。
/// </summary>
/// <param name="states">状态列表。</param>
/// <param name="symbols">要检查的符号列表。</param>
/// <returns><paramref name="symbols"/> 中包含的候选状态。</returns>
protected IEnumerable<T> GetCandidates(ArraySegment<int> symbols)
/// <param name="candidates">候选状态集合。</param>
protected void GetCandidates(int[] states, ValueTuple<int, int> symbols, HashSet<T> candidates)
{
foreach (int acceptState in symbols)
for (int i = symbols.Item1; i < symbols.Item2; i++)
{
int acceptState = states[i];
if (acceptState < 0)
{
// 跳过向前看的头状态。
Expand All @@ -134,7 +136,7 @@ protected IEnumerable<T> GetCandidates(ArraySegment<int> symbols)
var kind = data.Terminals[acceptState].Kind;
if (kind.HasValue)
{
yield return kind.Value;
candidates.Add(kind.Value);
}
}
}
Expand Down
46 changes: 28 additions & 18 deletions Runtime/Lexers/Core/RejectableCore`1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,27 @@ internal sealed class RejectableCore<T> : LexerCore<T>
/// <summary>
/// 接受符号的堆栈。
/// </summary>
private readonly Stack<ArraySegment<int>> symbolStack = new();
private readonly ListStack<ValueTuple<int, int>> symbolStack = new();
/// <summary>
/// 接受索引的堆栈。
/// </summary>
private readonly Stack<int> indexStack = new();
/// <summary>
/// 候选类型。
/// </summary>
private IReadOnlySet<T>? candidates;
private readonly HashSet<T> candidates = new();
/// <summary>
/// 无效的状态列表。
/// </summary>
private readonly HashSet<int> invalidStates = new();
/// <summary>
/// 当前候选符号。
/// </summary>
private ArraySegment<int> curSymbols;
private ValueTuple<int, int> curSymbols;
/// <summary>
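/// Whether the cached candidate set is up to date; when <c>false</c>, it is rebuilt on the next read of <see cref="Candidates"/>.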
/// </summary>
private bool isCandidatesValid = false;

/// <summary>
/// 使用给定的词法分析器信息初始化 <see cref="RejectableCore{T}"/> 类的新实例。
Expand All @@ -47,13 +51,17 @@ public override IReadOnlySet<T> Candidates
{
get
{
if (candidates == null)
if (!isCandidatesValid)
{
HashSet<T> result = new();
isCandidatesValid = true;
candidates.Clear();
int[] states = data.States;
// 先添加当前候选
result.UnionWith(GetCandidates(curSymbols));
result.UnionWith(symbolStack.SelectMany(GetCandidates));
candidates = result.AsReadOnly();
GetCandidates(states, curSymbols, candidates);
for (int i = 0; i < symbolStack.Count; i++)
{
GetCandidates(states, symbolStack[i], candidates);
}
}
return candidates;
}
Expand All @@ -69,6 +77,8 @@ public override bool NextToken(int state, int start)
{
symbolStack.Clear();
indexStack.Clear();
int[] states = data.States;
int symbolStart = 0, symbolEnd = 0;
while (true)
{
state = NextState(state);
Expand All @@ -77,19 +87,19 @@ public override bool NextToken(int state, int start)
// 没有合适的转移,退出。
break;
}
ArraySegment<int> symbols = data.GetSymbols(state);
if (symbols.Count > 0)
if (data.GetSymbols(state, ref symbolStart, ref symbolEnd))
{
if (data.UseShortest)
{
// 保存流的索引,避免被误修改影响后续匹配。
int originIndex = source.Index;
// 最短匹配时不需要生成候选列表。
candidates = SetUtil.Empty<T>();
candidates.Clear();
isCandidatesValid = true;
// 使用最短匹配时,需要先调用 Action。
foreach (int acceptState in symbols)
for (int i = symbolStart; i < symbolEnd; i++)
{
var terminal = data.Terminals[acceptState];
var terminal = data.Terminals[states[i]];
if (terminal.UseShortest)
{
controller.DoAction(start, terminal);
Expand All @@ -102,7 +112,7 @@ public override bool NextToken(int state, int start)
}
}
// 将接受状态记录在堆栈中。
symbolStack.Push(symbols);
symbolStack.Push(new ValueTuple<int, int>(symbolStart, symbolEnd));
indexStack.Push(source.Index);
}
}
Expand All @@ -112,18 +122,18 @@ public override bool NextToken(int state, int start)
{
curSymbols = symbolStack.Pop();
int index = indexStack.Pop();
while (curSymbols.Count > 0)
while (curSymbols.Item1 < curSymbols.Item2)
{
int acceptState = curSymbols[0];
curSymbols = curSymbols.Slice(1);
int acceptState = states[curSymbols.Item1];
curSymbols.Item1++;
if (invalidStates.Contains(acceptState))
{
continue;
}
// 将文本和流调整到与接受状态匹配的状态。
source.Index = index;
// 每次都需要清空候选集合,并在使用时重新计算。
candidates = null;
isCandidatesValid = false;
controller.DoAction(start, data.Terminals[acceptState]);
if (!controller.IsReject)
{
Expand Down
55 changes: 34 additions & 21 deletions Runtime/Lexers/Core/RejectableTrailingCore`1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,27 @@ internal sealed class RejectableTrailingCore<T> : LexerCore<T>
/// <summary>
/// 接受符号的堆栈。
/// </summary>
private readonly ListStack<ArraySegment<int>> symbolStack = new();
private readonly ListStack<ValueTuple<int, int>> symbolStack = new();
/// <summary>
/// 接受索引的堆栈。
/// </summary>
private readonly ListStack<int> indexStack = new();
/// <summary>
/// 候选类型。
/// </summary>
private IReadOnlySet<T>? candidates;
private readonly HashSet<T> candidates = new();
/// <summary>
/// 无效的状态列表。
/// </summary>
private readonly HashSet<int> invalidStates = new();
/// <summary>
/// 当前候选符号。
/// </summary>
private ArraySegment<int> curSymbols;
private ValueTuple<int, int> curSymbols;
/// <summary>
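/// Whether the cached candidate set is up to date; when <c>false</c>, it is rebuilt on the next read of <see cref="Candidates"/>.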
/// </summary>
private bool isCandidatesValid = false;

/// <summary>
/// 使用给定的词法分析器信息初始化 <see cref="RejectableTrailingCore{T}"/> 类的新实例。
Expand All @@ -47,13 +51,17 @@ public override IReadOnlySet<T> Candidates
{
get
{
if (candidates == null)
if (!isCandidatesValid)
{
HashSet<T> result = new();
isCandidatesValid = true;
candidates.Clear();
int[] states = data.States;
// 先添加当前候选
result.UnionWith(GetCandidates(curSymbols));
result.UnionWith(symbolStack.SelectMany(GetCandidates));
candidates = result.AsReadOnly();
GetCandidates(states, curSymbols, candidates);
for (int i = 0; i < symbolStack.Count; i++)
{
GetCandidates(states, symbolStack[i], candidates);
}
}
return candidates;
}
Expand All @@ -70,6 +78,8 @@ public override bool NextToken(int state, int start)
symbolStack.Clear();
indexStack.Clear();
int startIndex = source.Index;
int symbolStart = 0, symbolEnd = 0;
int[] states = data.States;
while (true)
{
state = NextState(state);
Expand All @@ -78,18 +88,19 @@ public override bool NextToken(int state, int start)
// 没有合适的转移,退出。
break;
}
ArraySegment<int> symbols = data.GetSymbols(state);
if (symbols.Count > 0)
if (data.GetSymbols(state, ref symbolStart, ref symbolEnd))
{
if (data.UseShortest)
{
// 保存流的索引,避免被误修改影响后续匹配。
int originIndex = source.Index;
// 最短匹配时不需要生成候选列表。
candidates = SetUtil.Empty<T>();
candidates.Clear();
isCandidatesValid = true;
// 使用最短匹配时,需要先调用 Action。
foreach (int acceptState in symbols)
for (int i = symbolStart; i < symbolEnd; i++)
{
int acceptState = states[i];
// 跳过向前看的头状态。
if (acceptState < 0)
{
Expand All @@ -109,7 +120,7 @@ public override bool NextToken(int state, int start)
}
}
// 将接受状态记录在堆栈中。
symbolStack.Push(symbols);
symbolStack.Push(new ValueTuple<int, int>(symbolStart, symbolEnd));
indexStack.Push(source.Index);
}
}
Expand All @@ -119,10 +130,10 @@ public override bool NextToken(int state, int start)
{
curSymbols = symbolStack.Pop();
int index = indexStack.Pop();
while (curSymbols.Count > 0)
while (curSymbols.Item1 < curSymbols.Item2)
{
int acceptState = curSymbols[0];
curSymbols = curSymbols.Slice(1);
int acceptState = states[curSymbols.Item1];
curSymbols.Item1++;
if (acceptState < 0)
{
// 跳过向前看的头状态。
Expand All @@ -134,7 +145,7 @@ public override bool NextToken(int state, int start)
}
AdjustIndex(acceptState, startIndex, index);
// 每次都需要清空候选集合,并在使用时重新计算。
candidates = null;
isCandidatesValid = false;
controller.DoAction(start, data.Terminals[acceptState]);
if (!controller.IsReject)
{
Expand All @@ -158,6 +169,7 @@ public override bool NextToken(int state, int start)
private void AdjustIndex(int state, int startIndex, int index)
{
TerminalData<T> terminal = data.Terminals[state];
int[] states = data.States;
int? trailing = terminal.Trailing;
if (trailing.HasValue)
{
Expand All @@ -179,7 +191,7 @@ private void AdjustIndex(int state, int startIndex, int index)
int target = -state - 1;
for (int i = 0; i < symbolStack.Count; i++)
{
if (ContainsTrailingHead(symbolStack[i], target))
if (ContainsTrailingHead(states, symbolStack[i], target))
{
index = indexStack[i];
break;
Expand All @@ -194,15 +206,16 @@ private void AdjustIndex(int state, int startIndex, int index)
/// <summary>
/// 返回指定的接受状态的符号索引中是否包含特定的向前看头状态。
/// </summary>
/// <param name="states">状态列表。</param>
/// <param name="symbols">接受状态的符号索引。</param>
/// <param name="target">目标向前看头状态。</param>
/// <returns>如果包含特定的目标,则为 <c>true</c>;否则为 <c>false</c>。</returns>
private static bool ContainsTrailingHead(ArraySegment<int> symbols, int target)
private static bool ContainsTrailingHead(int[] states, ValueTuple<int, int> symbols, int target)
{
// 在当前状态中查找,从后向前找。
for (int i = symbols.Count - 1; i >= 0; i--)
for (int i = symbols.Item2 - 1; i >= symbols.Item1; i--)
{
int idx = symbols[i];
int idx = states[i];
if (idx >= 0)
{
// 前面的状态已经不可能是向前看头状态了,所以直接退出。
Expand Down
23 changes: 23 additions & 0 deletions Runtime/Lexers/LexerData`1.cs
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,29 @@ public int NextState(int state, char ch)
return DfaStateData.InvalidState;
}

/// <summary>
/// Looks up the range of symbol entries recorded for the specified state.
/// </summary>
/// <param name="state">Index of the state to query.</param>
/// <param name="start">Set to the inclusive start index of the symbol range; left unchanged when the state has no symbols.</param>
/// <param name="end">Set to the exclusive end index of the symbol range; left unchanged when the state has no symbols.</param>
/// <returns><c>true</c> if the state has a non-zero symbol count and the range was written; otherwise <c>false</c>.</returns>
public bool GetSymbols(int state, ref int start, ref int end)
{
	// Each state's record is located at a stride of 4 entries in the state array.
	int recordBase = state * 4;
	int symbolCount = states[recordBase + DfaStateData.SymbolsLengthOffset];
	if (symbolCount == 0)
	{
		// Nothing recorded for this state: report failure without touching the caller's range.
		return false;
	}
	int first = states[recordBase + DfaStateData.SymbolIndexOffset];
	start = first;
	end = first + symbolCount;
	return true;
}

/// <summary>
/// 返回指定状态对应的符号。
/// </summary>
Expand Down

0 comments on commit 938c1e8

Please sign in to comment.