vocab#
- class Vocab(counter=None, max_size=None, min_freq=1, token_to_idx=None, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs)[源代码]#
基类:
object
The class used to convert between tokens and ids. It also includes some store/load functions.
- 参数:
counter (collections.Counter, optional) -- A Counter instance describes the tokens and their frequencies. Its keys will be indexed according to the order of frequency sorting to construct mapping relationship. If None,
token_to_idx
must be provided as the mapping relationship. Default: None.
max_size (int, optional) -- Max size of vocab, not including special tokens. Default: None.
min_freq (int, optional) -- Ignore tokens whose frequencies are less than
min_freq
. Default: 1.
token_to_idx (dict, optional) -- A dict specifies the mapping relationship between tokens and indices to be used. If provided, adjust the tokens and indices mapping according to it. If None, counter must be provided. Default: None.
unk_token (str, optional) -- Special token for unknown token. If no need, it also could be None. Default: None.
pad_token (str, optional) -- Special token for padding token. If no need, it also could be None. Default: None.
bos_token (str, optional) -- Special token for bos token. If no need, it also could be None. Default: None.
eos_token (str, optional) -- Special token for eos token. If no need, it also could be None. Default: None.
kwargs (dict) -- Keyword arguments ending with
_token
. It can be used to specify further special tokens that will be exposed as attribute of the vocabulary and associated with an index.
- to_tokens(indices)[源代码]#
Maps the input indices to token list.
- 参数:
indices (int|list[int]|tuple[int]|numpy.ndarray) -- The input indice(s) for mapping. Must be an
int
or a 1D `list[int]`|`tuple[int]`|`numpy.ndarray`.
- 返回:
Obtained token(s). If
indices
is an integer, it will return a str. If `indices`
is a list/tuple of integers, it will return a list of str.- 返回类型:
str|list[str]
示例
from paddlenlp.data import Vocab # The vocab file. The sample file can be downloaded firstly. # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt vocab_file_path = './senta_word_dict.txt' # Initialize the Vocab vocab = Vocab.load_vocabulary( vocab_file_path, unk_token='[UNK]', pad_token='[PAD]') tokens = vocab.to_tokens([0, 1, 2, 3]) print(tokens) # ['[PAD]', '[UNK]', '一斤三', '意面屋']
- to_indices(tokens)[源代码]#
Maps the input tokens into indices.
- 参数:
tokens (str|list[str]|tuple[str], optional) -- The input token(s) for mapping.
- 返回:
Obtained indice(s). If
tokens
is a str, it will return an integer. If `tokens`
is a list/tuple of str, it will return a list of integers.- 返回类型:
int|list[int]
示例
from paddlenlp.data import Vocab # The vocab file. The sample file can be downloaded firstly. # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt vocab_file_path = './senta_word_dict.txt' # Initialize the Vocab vocab = Vocab.load_vocabulary( vocab_file_path, unk_token='[UNK]', pad_token='[PAD]') tokens = vocab.to_indices(['[PAD]', '[UNK]', '一斤三', '意面屋']) print(tokens) # [0, 1, 2, 3]
- __call__(tokens)[源代码]#
Maps the input tokens into indices. Its function is the same as the
to_indices()
method. See details at
to_indices
.
- to_json(path=None)[源代码]#
Summarizes some information of vocab as JSON string. If path is given, the JSON string will be saved into files. The JSON string and the saved file all can be used to reconstruct the
Vocab
by calling `from_json()`
method.- 参数:
path (str, optional) -- The path to save JSON string. If None, the JSON will not be saved. Default: None.
- 返回:
The JSON string including information of vocab.
- 返回类型:
str
示例
from paddlenlp.data import Vocab # The vocab file. The sample file can be downloaded firstly. # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt vocab_file_path = './senta_word_dict.txt' # Initialize the Vocab vocab = Vocab.load_vocabulary( vocab_file_path, unk_token='[UNK]', pad_token='[PAD]') json_str = vocab.to_json(path='./vocab.json')
- classmethod from_json(json_str)[源代码]#
Loads
Vocab
from JSON string or JSON file, which is obtained by calling `to_json()`
method.- 参数:
json_str (str) -- JSON string or file path of JSON string.
- 返回:
An instance of
Vocab
generated from information contained in JSON string.- 返回类型:
示例
from paddlenlp.data import Vocab # The vocab file. The sample file can be downloaded firstly. # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt vocab_file_path = './senta_word_dict.txt' # Initialize the Vocab vocab = Vocab.load_vocabulary( vocab_file_path, unk_token='[UNK]', pad_token='[PAD]') json_str = vocab.to_json(path='./vocab.json') vocab1 = Vocab.from_json(json_str) vocab2 = Vocab.from_json('./vocab.json') print(len(vocab), len(vocab1), len(vocab2)) # 1256608 1256608 1256608
- classmethod from_dict(token_to_idx, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs)[源代码]#
Builds the
Vocab
from a dict.- 参数:
token_to_idx (dict) -- A dict describes the mapping relationship between tokens and indices.
unk_token (str, optional) -- The special token for unknow token. If no need, it also could be None. Default: None.
pad_token (str, optional) -- The special token for padding token. If no need, it also could be None. Default: None.
bos_token (str, optional) -- The special token for bos token. If no need, it also could be None. Default: None.
eos_token (str, optional) -- The special token for eos token. If no need, it also could be None. Default: None.
kwargs (dict) -- Keyword arguments ending with
_token
. It can be used to specify further special tokens that will be exposed as attribute of the vocabulary and associated with an index.
- 返回:
An instance of
Vocab
generated from the given dict and special tokens.- 返回类型:
示例
from paddlenlp.data import Vocab # The vocab file. The sample file can be downloaded firstly. # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt vocab_file_path = './senta_word_dict.txt' # Initialize the Vocab vocab = Vocab.load_vocabulary( vocab_file_path, unk_token='[UNK]', pad_token='[PAD]') vocab1 = Vocab.from_dict(vocab.token_to_idx) print(len(vocab), len(vocab.token_to_idx), len(vocab1)) # 1256608 1256608 1256608
- static build_vocab(iterator, max_size=None, min_freq=1, token_to_idx=None, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs)[源代码]#
Builds the
Vocab
according to given iterator and other information. Firstly, iterate over the `iterator`
to construct a `collections.Counter`
and use it to initialize the `Vocab`
.- 参数:
iterator (collections.Iterable) -- Iterator of tokens. Each element should be a list of tokens if wordlevel vocab is needed.
max_size (int, optional) -- The max size of vocab, not including special tokens. Default: None.
min_freq (int, optional) -- Ignore tokens whose frequencies are less than
min_freq
. Default: 1.token_to_idx (dict, optional) -- A dict specifies the mapping relationship between tokens and indices to be used. If provided, adjust the tokens and indices mapping according to it. If None, counter must be provided. Default: None.
unk_token (str, optional) -- The special token for unknown token '&lt;unk&gt;'. If no need, it also could be None. Default: None.
pad_token (str, optional) -- The special token for padding token '<pad>'. If no need, it also could be None. Default: None.
bos_token (str, optional) -- The special token for bos token '<bos>'. If no need, it also could be None. Default: None.
eos_token (str, optional) -- The special token for eos token '<eos>'. If no need, it also could be None. Default: None.
kwargs (dict) -- Keyword arguments ending with
_token
. It can be used to specify further special tokens that will be exposed as attribute of the vocabulary and associated with an index.
- 返回:
An instance of
Vocab
generated from given iterator and other information.- 返回类型:
示例
from paddlenlp.data import Vocab # The vocab file. The sample file can be downloaded firstly. # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt vocab_file_path = './senta_word_dict.txt' # Initialize the Vocab vocab = Vocab.load_vocabulary( vocab_file_path, unk_token='[UNK]', pad_token='[PAD]') vocab1 = Vocab.build_vocab([list(vocab.token_to_idx.keys())]) print(len(vocab), len(vocab1)) # 1256608 1256608
- static load_vocabulary(filepath, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs)[源代码]#
Builds the
Vocab
from a file reserving all tokens by calling `Vocab.from_dict()`
method. The file contains a token per line, and the line index would be the index of corresponding token.- 参数:
filepath (str) -- the path of file to construct vocabulary.
unk_token (str, optional) -- special token for unknown token. If no need, it also could be None. Default: None.
pad_token (str, optional) -- special token for padding token. If no need, it also could be None. Default: None.
bos_token (str, optional) -- special token for bos token. If no need, it also could be None. Default: None.
eos_token (str, optional) -- special token for eos token. If no need, it also could be None. Default: None.
kwargs (dict) -- Keyword arguments ending with
_token
. It can be used to specify further special tokens that will be exposed as attribute of the vocabulary and associated with an index.
- 返回:
An instance of
Vocab
generated from the given file.- 返回类型:
示例
from paddlenlp.data import Vocab # The vocab file. The sample file can be downloaded firstly. # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt vocab_file_path = './senta_word_dict.txt' # Initialize the Vocab vocab = Vocab.load_vocabulary( vocab_file_path, unk_token='[UNK]', pad_token='[PAD]') print(len(vocab)) # 1256608
- save_vocabulary(filepath)[源代码]#
Save the
Vocab
to a specific file. Can be reloaded by calling `load_vocabulary`
.- 参数:
filepath (str) -- the path of file to save vocabulary.