From aadb8e9a1e58708d78dd344b9184a36b8ed05248 Mon Sep 17 00:00:00 2001 From: Saarth Shah <69341449+SaarthShah@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:30:29 -0700 Subject: [PATCH 1/8] added support for load json --- symspellpy/symspellpy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index d6b3b6e..ef02cab 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -315,6 +315,9 @@ def load_bigram_dictionary( infile, term_index, count_index, separator ) + def load_json(self, corpus): + self.words = corpus + def load_dictionary( self, corpus: Union[Path, str], From 359e380bb65ff3729f0a4e0e8a34715ed354f4ad Mon Sep 17 00:00:00 2001 From: Saarth Shah <69341449+SaarthShah@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:57:04 -0700 Subject: [PATCH 2/8] adding _words --- symspellpy/symspellpy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index ef02cab..372cf65 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -316,7 +316,7 @@ def load_bigram_dictionary( ) def load_json(self, corpus): - self.words = corpus + self._words = corpus['Count'] def load_dictionary( self, From 4cb585ca70844f5cbad01c0ebf080db4bc1f80d3 Mon Sep 17 00:00:00 2001 From: Saarth Shah <69341449+SaarthShah@users.noreply.github.com> Date: Thu, 11 Jul 2024 10:58:45 -0700 Subject: [PATCH 3/8] removed Count key --- symspellpy/symspellpy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index 372cf65..da68fcf 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -316,7 +316,7 @@ def load_bigram_dictionary( ) def load_json(self, corpus): - self._words = corpus['Count'] + self._words = corpus def load_dictionary( self, From 6cdacdc58216ef81bbb36e15e020b01b5145aeb1 Mon Sep 17 00:00:00 2001 From: Saarth Shah <69341449+SaarthShah@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:03:15 -0700 Subject: [PATCH 4/8] json update --- symspellpy/symspellpy.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index da68fcf..40c990d 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -315,8 +315,19 @@ def load_bigram_dictionary( infile, term_index, count_index, separator ) - def load_json(self, corpus): + def load_json(self, corpus: Dict[str, int]) -> None: + """Loads dictionary data from a JSON object. + + Args: + corpus: A dictionary where keys are words and values are their frequencies. + """ self._words = corpus + self._max_length = max(map(len, self._words.keys()), default=0) + self._deletes.clear() + for key in self._words: + edits = self._edits_prefix(key) + for delete in edits: + self._deletes[delete].append(key) def load_dictionary( self, From 12033d6385adeb9adaefe13f5aff5d732025b5ef Mon Sep 17 00:00:00 2001 From: Saarth Shah <69341449+SaarthShah@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:06:29 -0700 Subject: [PATCH 5/8] faster loading --- symspellpy/symspellpy.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index 40c990d..775709e 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -324,11 +324,18 @@ def load_json(self, corpus: Dict[str, int]) -> None: self._words = corpus self._max_length = max(map(len, self._words.keys()), default=0) self._deletes.clear() + + # Use a dictionary to collect all deletes first + deletes_dict = defaultdict(list) for key in self._words: edits = self._edits_prefix(key) for delete in edits: - self._deletes[delete].append(key) - + deletes_dict[delete].append(key) + + # Update self._deletes in one go + self._deletes.update(deletes_dict) + + def load_dictionary( self, corpus: Union[Path, str], From 4670b5e8308598b6570f8d62490982c18e057a92 Mon Sep 17 00:00:00 2001 From: Saarth Shah <69341449+SaarthShah@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:39:12 -0700 Subject: [PATCH 6/8] async loading --- symspellpy/symspellpy.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index 775709e..6334399 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -28,6 +28,7 @@ from itertools import cycle from pathlib import Path from typing import IO, Dict, List, Optional, Pattern, Set, Union +import concurrent.futures from symspellpy import helpers from symspellpy.composition import Composition @@ -327,10 +328,16 @@ def load_json(self, corpus: Dict[str, int]) -> None: # Use a dictionary to collect all deletes first deletes_dict = defaultdict(list) - for key in self._words: + + def process_key(key): edits = self._edits_prefix(key) - for delete in edits: - deletes_dict[delete].append(key) + return [(delete, key) for delete in edits] + + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [executor.submit(process_key, key) for key in self._words] + for future in concurrent.futures.as_completed(futures): + for delete, key in future.result(): + deletes_dict[delete].append(key) # Update self._deletes in one go self._deletes.update(deletes_dict) From 05bcb24c9a25c0ae747231745735b0d4dc685e1e Mon Sep 17 00:00:00 2001 From: Saarth Shah <69341449+SaarthShah@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:41:35 -0700 Subject: [PATCH 7/8] load_json back to async --- symspellpy/symspellpy.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index 6334399..1c3664d 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -28,7 +28,6 @@ from itertools import cycle from pathlib import Path from typing import IO, Dict, List, Optional, Pattern, Set, Union -import concurrent.futures from symspellpy import helpers from symspellpy.composition import Composition @@ -328,19 +327,14 @@ def load_json(self, corpus: Dict[str, int]) -> None: # Use a dictionary to collect all deletes first deletes_dict = defaultdict(list) - - def process_key(key): + for key in self._words: edits = self._edits_prefix(key) - return [(delete, key) for delete in edits] - - with concurrent.futures.ThreadPoolExecutor() as executor: - futures = [executor.submit(process_key, key) for key in self._words] - for future in concurrent.futures.as_completed(futures): - for delete, key in future.result(): - deletes_dict[delete].append(key) + for delete in edits: + deletes_dict[delete].append(key) # Update self._deletes in one go self._deletes.update(deletes_dict) + return True def load_dictionary( From 413820a003a93a3b06bbdf4cc566cb7e6e3e9559 Mon Sep 17 00:00:00 2001 From: Saarth Shah <69341449+SaarthShah@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:46:25 -0700 Subject: [PATCH 8/8] faster loading --- symspellpy/symspellpy.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/symspellpy/symspellpy.py b/symspellpy/symspellpy.py index 1c3664d..9cb3c6c 100644 --- a/symspellpy/symspellpy.py +++ b/symspellpy/symspellpy.py @@ -323,17 +323,15 @@ def load_json(self, corpus: Dict[str, int]) -> None: """ self._words = corpus self._max_length = max(map(len, self._words.keys()), default=0) - self._deletes.clear() # Use a dictionary to collect all deletes first deletes_dict = defaultdict(list) for key in self._words: - edits = self._edits_prefix(key) - for delete in edits: + for delete in self._edits_prefix(key): deletes_dict[delete].append(key) # Update self._deletes in one go - self._deletes.update(deletes_dict) + self._deletes = deletes_dict return True