class TFIDF(object):
    """Compute a dense TF-IDF matrix for a small in-memory corpus.

    Using a library as an analogy (each document is one book):

    * tf: how often a word occurs in one book, divided by the number of
      words kept from that book after stop-word filtering.
    * idf: ``1 + log((number of books + smooth_value)
      / (number of books containing the word + smooth_value))``.
    * tfidf = tf * idf, i.e. the score of that word for that book.

    Args:
        corpus_: list of documents. Each document is either an iterable of
            tokens or a single string, which is split on ``word_sep``.
        stop_words: container of tokens to drop; only membership tests
            (``token in stop_words``) are performed on it.
        word_sep: separator used to split string documents.
        smooth_value: smoothing term added to the numerator and the
            denominator inside the idf logarithm.

    Raises:
        AssertionError: if ``corpus_`` is not a ``list``.

    Attributes populated by :meth:`get_tf_idf`:
        vob: maps ``'<doc index>_<word>'`` to the word's count in that doc.
        doc_cnt: maps each word to the set of doc indices containing it.
        word_unq: set of all distinct words kept.
        vocabulary: list of words in matrix column order.
    """

    def __init__(self, corpus_, stop_words, word_sep=' ', smooth_value=0.01):
        # Raised explicitly (instead of a bare ``assert``) so the check still
        # runs under ``python -O``; exception type and message are unchanged.
        if not isinstance(corpus_, list):
            raise AssertionError('Not support this type corpus.')
        self.corpus = corpus_
        self.vob = defaultdict(int)
        self.word_sep = word_sep
        self.smooth_value = smooth_value
        self.doc_cnt = defaultdict(set)
        self.word_unq = set()
        self.stop_words = stop_words
        self.vocabulary = []

    def get_tf_idf(self):
        """Return the TF-IDF matrix of the corpus.

        Returns:
            A ``numpy`` array of shape ``(len(corpus), number of distinct
            words)``. Row ``i`` is document ``i``; column ``j`` is the word
            ``self.vocabulary[j]``. The column order is the iteration order
            of the ``word_unq`` set, which is not sorted.
        """
        # Start from clean statistics on every call; otherwise a second call
        # would add to the counts of the first and inflate every tf value.
        self.vob = defaultdict(int)
        self.doc_cnt = defaultdict(set)
        self.word_unq = set()

        filter_corpus = []
        for doc_idx, line in enumerate(self.corpus):
            if isinstance(line, str):
                line = line.split(self.word_sep)
            line = [w for w in line if w not in self.stop_words]
            filter_corpus.append(line)
            for w in line:
                self.vob[f'{doc_idx}_{w}'] += 1
                self.doc_cnt[w].add(doc_idx)
                self.word_unq.add(w)

        self.vocabulary = list(self.word_unq)
        column_of = {w: j for j, w in enumerate(self.vocabulary)}
        n_docs = len(self.corpus)

        # idf depends only on the word, so compute it once per word.
        idf = {
            w: 1 + np.log((n_docs + self.smooth_value)
                          / (self.smooth_value + len(self.doc_cnt[w])))
            for w in self.vocabulary
        }

        output = np.zeros((n_docs, len(self.vocabulary)))
        for doc_idx, line in enumerate(filter_corpus):
            doc_len = len(line)
            # Only words present in the document get a non-zero score, so
            # visit those instead of scanning the whole vocabulary.
            for w in set(line):
                tf = self.vob[f'{doc_idx}_{w}'] / doc_len
                output[doc_idx, column_of[w]] = tf * idf[w]
        return output
if __name__ == '__main__':
    # Demo: every inner list plays the role of one book in the library.
    books = [
        ['this', 'is', 'a', 'simple', 'tfidf', 'code', 'but', 'code',
         'might', 'has', 'bugs'],
        ['python', 'is', 'a', 'code', 'language', 'not', 'human',
         'language'],
        ['learning', 'python', 'make', 'things', 'simple', 'but', 'not',
         'simple', 'enough'],
    ]
    model = TFIDF(books, stop_words=['a'], smooth_value=1)
    tfidf_matrix = model.get_tf_idf()
    print(tfidf_matrix)