def tokenize(sentences, token='word'):
    """Split sentences into word or char tokens."""
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unknown token type ' + token)
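As a quick sanity check, tokenize can be called at either granularity (a minimal sketch; the sample sentences are invented for illustration):

# Toy input, invented for illustration
sentences = ['go now .', 'i am home .']
print(tokenize(sentences))          # [['go', 'now', '.'], ['i', 'am', 'home', '.']]
print(tokenize(sentences, 'char'))  # character tokens, spaces included: [['g', 'o', ' ', ...], ...]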
class Vocab(object):
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = count_corpus(tokens)  # count how often each token appears
        self.token_freqs = list(counter.items())
        self.idx_to_token = []
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        self.idx_to_token += [token for token, freq in self.token_freqs
                              if freq >= min_freq and token not in self.idx_to_token]
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx
    def __len__(self):
        return len(self.idx_to_token)
    def __getitem__(self, tokens):
        # Map a single token to its index (unknown tokens fall back to unk),
        # or recursively map a list/tuple of tokens
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
    def to_tokens(self, indices):
        # Inverse mapping: an index (or list of indices) back to token(s)
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
import collections

def count_corpus(sentences):
    # Flatten the nested token lists, then count each token's occurrences
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens)  # a dict-like mapping from token to count
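Putting the pieces together, a minimal end-to-end sketch (the toy corpus is invented for illustration):

# Build a vocabulary from tokenized toy sentences
corpus = tokenize(['go now .', 'i am home now .'])
vocab = Vocab(corpus, min_freq=1, use_special_tokens=True)
print(vocab['now'])                # index of 'now'
print(vocab[['go', 'home']])       # list of indices for a list of tokens
print(vocab.to_tokens(vocab.bos))  # '<bos>'
print(len(vocab))                  # vocabulary size, including the 4 special tokens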