本文主要介紹了Rasa中相關Tokenizer的具體實現,包括預設Tokenizer和第三方Tokenizer。前者包括JiebaTokenizer、MitieTokenizer、SpacyTokenizer和WhitespaceTokenizer,後者包括BertTokenizer和AnotherWhitespaceTokenizer。
一.JiebaTokenizer
JiebaTokenizer類整體程式碼結構,如下所示:
載入自定義字典程式碼,如下所示[3]:
@staticmethod
def _load_custom_dictionary(path: Text) -> None:
    """Register every entry found directly under ``path`` as a jieba user dictionary.

    The dictionary file format is described in the jieba documentation:
    https://github.com/fxsjy/jieba#load-dictionary

    Args:
        path: Directory whose entries are each handed to ``jieba.load_userdict``.
    """
    print("JiebaTokenizer._load_custom_dictionary()")
    import jieba

    # Everything matched by the glob is treated as a dictionary file.
    for dictionary_file in glob.glob(f"{path}/*"):
        logger.info(f"Loading Jieba User Dictionary at {dictionary_file}")
        jieba.load_userdict(dictionary_file)
實現分詞的程式碼為tokenize()方法,如下所示:
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Split the text stored under ``attribute`` of ``message`` into tokens with jieba.

    Args:
        message: Incoming message that holds the text to split.
        attribute: Name of the message attribute whose text is tokenized.

    Returns:
        The tokens after ``self._apply_token_pattern`` has processed them.
    """
    print("JiebaTokenizer.tokenize()")
    import jieba

    raw_text = message.get(attribute)
    tokens = []
    # Each item is unpacked as (word, start, end); only word and start are
    # forwarded to Token, the end value is not used here.
    for word, start, _end in jieba.tokenize(raw_text):
        tokens.append(Token(word, start))
    return self._apply_token_pattern(tokens)
self._apply_token_pattern(tokens)的返回值資料型別為List[Token]。Token類的定義,如下所示:
class Token:
    """Used by tokenizers that split a single message into multiple tokens."""

    def __init__(
        self,
        text: Text,
        start: int,
        end: Optional[int] = None,
        data: Optional[Dict[Text, Any]] = None,
        lemma: Optional[Text] = None,
    ) -> None:
        """Create a token.

        Args:
            text: The token text.
            start: The start index of the token within the entire message.
            end: The end index of the token within the entire message. When
                falsy (``None`` or ``0``) it is computed as
                ``start + len(text)``.
            data: Additional token data. A falsy value is replaced by a new
                empty dict.
            lemma: An optional lemmatized version of the token text. A falsy
                value falls back to ``text``.
        """
        self.text = text
        self.start = start

        # Derive the end offset from the text length unless a truthy one was given.
        if end:
            self.end = end
        else:
            self.end = start + len(text)

        self.data = data or {}
        self.lemma = lemma if lemma else text
特別說明:JiebaTokenizer元件的is_trainable=True。
二.MitieTokenizer
MitieTokenizer類整體程式碼結構,如下所示:
核心程式碼tokenize()方法程式碼,如下所示:
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Split the text stored under ``attribute`` of ``message`` into tokens with MITIE.

    Args:
        message: Incoming message that holds the text to split.
        attribute: Name of the message attribute whose text is tokenized.

    Returns:
        The tokens after ``self._apply_token_pattern`` has processed them.
    """
    import mitie

    # The text is encoded before it is handed to MITIE; the same encoded
    # sentence is passed on so each offset can be turned into a Token.
    # NOTE(review): offsets presumably index into the encoded bytes — confirm
    # against _token_from_offset.
    sentence_bytes = message.get(attribute).encode(DEFAULT_ENCODING)
    tokens = []
    for token, offset in mitie.tokenize_with_offsets(sentence_bytes):
        tokens.append(self._token_from_offset(token, offset, sentence_bytes))
    return self._apply_token_pattern(tokens)
特別說明:mitie庫在Windows上安裝可能麻煩些。MitieTokenizer元件的is_trainable=False。
三.SpacyTokenizer
首先安裝Spacy類庫和模型[4][5],如下所示:
pip3 install -U spacy
python3 -m spacy download zh_core_web_sm
SpacyTokenizer類整體程式碼結構,如下所示:
核心程式碼tokenize()方法程式碼,如下所示:
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Tokenize the text of the given attribute of the incoming message.

    Args:
        message: Incoming message that holds the text to split.
        attribute: Name of the message attribute whose text is tokenized.

    Returns:
        An empty list when ``self._get_doc`` yields nothing; otherwise the
        tokens after ``self._apply_token_pattern`` has processed them.
    """
    doc = self._get_doc(message, attribute)  # iterated below, one item per token
    if not doc:
        return []

    tokens = [
        Token(
            t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)}
        )
        for t in doc
        # Skip items whose text is empty or whitespace only.
        if t.text and t.text.strip()
    ]

    # Return the result like the other tokenizers do; without this the
    # method would fall through and return None for every non-empty doc.
    return self._apply_token_pattern(tokens)
特別說明:SpacyTokenizer元件的is_trainable=False。即SpacyTokenizer只有執行元件run_SpacyTokenizer0,沒有訓練元件。如下所示:
四.WhitespaceTokenizer
WhitespaceTokenizer主要是針對英文的,不可用於中文。WhitespaceTokenizer類整體程式碼結構,如下所示:
其中,predict_schema和train_schema,如下所示:
rasa shell nlu --debug
結果,如下所示:
特別說明:WhitespaceTokenizer元件的is_trainable=False。
五.BertTokenizer
rasa shell nlu --debug
結果,如下所示:
  BertTokenizer程式碼具體實現,如下所示:
"""
https://github.com/daiyizheng/rasa-chinese-plus/blob/master/rasa_chinese_plus/nlu/tokenizers/bert_tokenizer.py
"""
from typing import List, Text, Dict, Any
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.shared.nlu.training_data.message import Message
from transformers import AutoTokenizer
from rasa.nlu.tokenizers.tokenizer import Tokenizer, Token
@DefaultV1Recipe.register(
    DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER, is_trainable=False
)
class BertTokenizer(Tokenizer):
    """Tokenizer component that delegates to a pretrained ``AutoTokenizer``."""

    def __init__(self, config: Dict[Text, Any] = None) -> None:
        """Load the pretrained tokenizer described by ``config``.

        Args:
            config: Component configuration. The key
                ``"pretrained_model_name_or_path"`` is required; ``"cache_dir"``
                and ``"use_fast"`` are optional.
                NOTE(review): the ``None`` default cannot actually be used,
                because ``config`` is indexed unconditionally below.
        """
        super().__init__(config)
        model_name_or_path = config["pretrained_model_name_or_path"]
        cache_dir = config.get("cache_dir")
        # Any truthy "use_fast" value enables the fast tokenizer.
        use_fast = bool(config.get("use_fast"))
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            cache_dir=cache_dir,
            use_fast=use_fast,
        )

    @classmethod
    def required_packages(cls) -> List[Text]:
        """Return the packages this component depends on."""
        return ["transformers"]

    @staticmethod
    def get_default_config() -> Dict[Text, Any]:
        """Return the component's default config."""
        defaults = {
            # Flag to check whether to split intents
            "intent_tokenization_flag": False,
            # Symbol on which intent should be split
            "intent_split_symbol": "_",
            # Regular expression to detect tokens
            "token_pattern": None,
            # Symbol on which prefix should be split
            "prefix_separator_symbol": None,
        }
        return defaults

    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        """Tokenize the text of ``attribute`` with the pretrained tokenizer.

        Args:
            message: Incoming message that holds the text to split.
            attribute: Name of the message attribute whose text is tokenized.

        Returns:
            The tokens after ``self._apply_token_pattern`` has processed them.
        """
        text = message.get(attribute)
        # NOTE(review): offset mappings and .tokens() presumably require a
        # fast tokenizer — confirm behaviour when "use_fast" is not set.
        encoding = self.tokenizer(
            text, return_offsets_mapping=True, add_special_tokens=False
        )
        tokens = []
        # Pair every token string with its (start, end) offset.
        for piece, span in zip(encoding.tokens(), encoding["offset_mapping"]):
            tokens.append(Token(text=piece, start=span[0], end=span[1]))
        return self._apply_token_pattern(tokens)
特別說明:BertTokenizer元件的is_trainable=False。
六.AnotherWhitespaceTokenizer
AnotherWhitespaceTokenizer程式碼具體實現,如下所示:
from __future__ import annotations
from typing import Any, Dict, List, Optional, Text
from rasa.engine.graph import ExecutionContext
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.shared.nlu.training_data.message import Message
@DefaultV1Recipe.register(
    DefaultV1Recipe.ComponentType.MESSAGE_TOKENIZER, is_trainable=False
)
class AnotherWhitespaceTokenizer(Tokenizer):
    """Whitespace tokenizer that can optionally drop non-alphanumeric characters."""

    @staticmethod
    def not_supported_languages() -> Optional[List[Text]]:
        """The languages that are not supported."""
        return ["zh", "ja", "th"]

    @staticmethod
    def get_default_config() -> Dict[Text, Any]:
        """Returns the component's default config."""
        return {
            # This *must* be added due to the parent class.
            "intent_tokenization_flag": False,
            # This *must* be added due to the parent class.
            "intent_split_symbol": "_",
            # When True, every character that is neither a space nor
            # alphanumeric is removed before splitting.
            "only_alphanum": True,
        }

    def __init__(self, config: Dict[Text, Any]) -> None:
        """Initialize the tokenizer.

        Args:
            config: Component configuration; must contain ``"only_alphanum"``.
        """
        super().__init__(config)
        self.only_alphanum = config["only_alphanum"]

    def parse_string(self, s: Text) -> Text:
        """Return ``s`` with non-alphanumeric characters removed if configured.

        Spaces are always kept. When ``only_alphanum`` is falsy, ``s`` is
        returned unchanged.
        """
        if self.only_alphanum:
            return "".join(c for c in s if c == " " or c.isalnum())
        return s

    @classmethod
    def create(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> AnotherWhitespaceTokenizer:
        """Create the component from ``config``; the other arguments are unused."""
        return cls(config)

    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        """Split the text of ``attribute`` on single spaces.

        Args:
            message: Incoming message that holds the text to split.
            attribute: Name of the message attribute whose text is tokenized.

        Returns:
            The tokens after ``self._apply_token_pattern`` has processed them.
        """
        raw_text = message.get(attribute)
        text = self.parse_string(raw_text)
        words = [w for w in text.split(" ") if w]

        # If we removed everything, like smiles `:)`, use the whole unfiltered
        # text as 1 token. Falling back to the filtered text would produce a
        # single empty token instead.
        if not words:
            text = raw_text
            words = [text]

        # the ._convert_words_to_tokens() method is from the parent class.
        tokens = self._convert_words_to_tokens(words, text)
        return self._apply_token_pattern(tokens)
特別說明:AnotherWhitespaceTokenizer元件的is_trainable=False。
參考文獻:
[1]自定義Graph Component:1.1-JiebaTokenizer具體實現:https://mp.weixin.qq.com/s/awGiGn3uJaNcvJBpk4okCA
[2]https://github.com/RasaHQ/rasa
[3]https://github.com/fxsjy/jieba#load-dictionary
[4]spaCy GitHub:https://github.com/explosion/spaCy
[5]spaCy官網:https://spacy.io/
[6]https://github.com/daiyizheng/rasa-chinese-plus/blob/master/rasa_chinese_plus/nlu/tokenizers/bert_tokenizer.py