forked from Tongyi-EconML/FinQwen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer.py
44 lines (30 loc) · 933 Bytes
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""
Tokenizers with Chinese-language support: a jieba-based word segmenter and a
ModelScope pretrained-model tokenizer.
"""
import jieba
from abc import abstractmethod
from modelscope import AutoTokenizer
def singleton(cls):
    """Class decorator that makes ``cls`` a lazily created singleton.

    The decorated name is rebound to a factory function. The first call
    constructs the instance with whatever arguments it receives; every later
    call returns that same instance and ignores the arguments it was given.

    Args:
        cls: The class to wrap.

    Returns:
        A callable that returns the single shared instance of ``cls``. The
        original class remains reachable as ``<callable>.__wrapped__``.
    """
    # Local import keeps this block independent of the module's import list.
    from functools import wraps

    instances = {}

    # updated=() stops the class __dict__ from being copied onto the function;
    # only metadata such as __name__, __doc__ and __module__ is carried over.
    @wraps(cls, updated=())
    def wrapper(*args, **kwargs):
        if cls not in instances:
            instances[cls] = cls(*args, **kwargs)
        return instances[cls]

    return wrapper
class Tokenizer(object):
    """Base interface for tokenizers: subclasses implement ``tokenize``."""

    # NOTE: this class does not use ABCMeta, so @abstractmethod alone does not
    # prevent instantiation; the explicit raise below is what surfaces a
    # missing override instead of silently returning None.
    @abstractmethod
    def tokenize(self, text):
        """Split ``text`` into tokens.

        Args:
            text: The input to tokenize.

        Returns:
            The tokens produced by the concrete tokenizer.

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement tokenize()"
        )
@singleton
class JiebaTokenizer(Tokenizer):
    """Tokenizer backed by ``jieba.cut``.

    Wrapped by ``singleton``: the first call builds the instance and later
    calls return it unchanged, so a ``cut_all`` value passed after the first
    construction has no effect.
    """

    def __init__(self, cut_all=False):
        """Store the segmentation mode and initialize jieba.

        Args:
            cut_all: Forwarded to ``jieba.cut`` as its ``cut_all`` argument.
        """
        self.cut_all = cut_all
        # NOTE(review): presumably loads jieba's dictionary up front rather
        # than on the first cut -- confirm against the jieba documentation.
        jieba.initialize()

    def tokenize(self, text):
        """Return the segments of ``text`` produced by jieba as a list."""
        segments = jieba.cut(text, cut_all=self.cut_all)
        return list(segments)
@singleton
class MsTokenizer(Tokenizer):
    """Tokenizer backed by a pretrained tokenizer loaded via ``AutoTokenizer``.

    Wrapped by ``singleton``: the first call builds the instance and later
    calls return it unchanged, so a ``name`` passed after the first
    construction has no effect.
    """

    def __init__(self, name="TongyiFinance/Tongyi-Finance-14B"):
        """Load the pretrained tokenizer identified by ``name``.

        Args:
            name: Identifier handed to ``AutoTokenizer.from_pretrained``.
        """
        # NOTE(review): trust_remote_code=True presumably permits running
        # tokenizer code shipped with the model repository -- confirm that the
        # source named by ``name`` is trusted.
        self.tokenizer = AutoTokenizer.from_pretrained(
            name,
            trust_remote_code=True,
        )

    def tokenize(self, text):
        """Return the ``input_ids`` for ``text``, or ``[]`` if the key is absent."""
        encoded = self.tokenizer(text)
        return encoded.get("input_ids", [])