tfidf代碼簡單實現

from collections import defaultdict

import numpy as np


class TFIDF(object):
    """Minimal TF-IDF vectorizer over an in-memory corpus.

    Using a library as an analogy, where every document is one book:

    * tf:  frequency of a word inside one particular book, i.e.
      (occurrences of the word in the book) / (number of kept words in the
      book, after stop-word removal).
    * idf: 1 + log((number_of_books + smooth) / (books_containing_word + smooth))
    * tfidf = tf * idf, the weight of that word for that book.

    Args:
        corpus_: list of documents. Each document is either an already
            tokenised sequence of words or a string that is split on
            ``word_sep``.
        stop_words: container of words that are dropped before counting.
        word_sep: separator used to tokenise string documents.
        smooth_value: smoothing term added to numerator and denominator of
            the idf ratio.

    Attributes:
        vob: maps ``f'{doc_index}_{word}'`` to the word's count in that doc.
        doc_cnt: maps each word to the set of document indices containing it.
        word_unq: set of all distinct kept words.
        vocabulary: list of distinct kept words in first-appearance order;
            ``vocabulary[j]`` is the word behind column ``j`` of the matrix
            returned by the latest ``get_tf_idf`` call.
    """

    def __init__(self, corpus_, stop_words, word_sep=' ', smooth_value=0.01):
        assert isinstance(corpus_, list), 'Not support this type corpus.'
        self.corpus = corpus_
        self.vob = defaultdict(int)
        self.word_sep = word_sep
        self.smooth_value = smooth_value
        self.doc_cnt = defaultdict(set)
        self.word_unq = set()
        self.stop_words = stop_words
        self.vocabulary = []

    def get_tf_idf(self):
        """Compute the TF-IDF matrix of the corpus.

        Returns:
            numpy.ndarray of shape ``(len(corpus), len(vocabulary))`` where
            entry ``[i, j]`` is the tf-idf weight of ``self.vocabulary[j]``
            in document ``i`` (0.0 when the word does not occur there).
        """
        # Start from clean statistics on every call; otherwise a second call
        # would add to the previous counts and double every term frequency.
        self.vob = defaultdict(int)
        self.doc_cnt = defaultdict(set)
        self.word_unq = set()

        # word -> column index, assigned in first-appearance order so the
        # column layout is reproducible (set iteration order is not).
        column_of = {}
        doc_sizes = []
        for doc_idx, line in enumerate(self.corpus):
            if isinstance(line, str):
                line = line.split(self.word_sep)
            tokens = [tok for tok in line if tok not in self.stop_words]
            doc_sizes.append(len(tokens))
            for w in tokens:
                self.vob[f'{doc_idx}_{w}'] += 1
                self.doc_cnt[w].add(doc_idx)
                self.word_unq.add(w)
                column_of.setdefault(w, len(column_of))
        self.vocabulary = list(column_of)

        n_docs = len(self.corpus)
        output = np.zeros((n_docs, len(column_of)))
        for w, col in column_of.items():
            containing_docs = self.doc_cnt[w]
            # idf depends only on the word, so compute it once per column.
            idf = 1 + np.log((n_docs + self.smooth_value)
                             / (self.smooth_value + len(containing_docs)))
            # Only documents that contain the word get a non-zero weight;
            # such documents have at least one token, so the size is > 0.
            for doc_idx in containing_docs:
                tf = self.vob[f'{doc_idx}_{w}'] / doc_sizes[doc_idx]
                output[doc_idx, col] = tf * idf
        return output


if __name__ == '__main__':
    # Demo run: every inner list stands for one "book" (document).
    books = [
        ['this', 'is', 'a', 'simple', 'tfidf', 'code', 'but', 'code', 'might', 'has', 'bugs'],
        ['python', 'is', 'a', 'code', 'language', 'not', 'human', 'language'],
        ['learning', 'python', 'make', 'things', 'simple', 'but', 'not', 'simple', 'enough'],
    ]
    model = TFIDF(books, stop_words=['a'], smooth_value=1)
    tfidf_matrix = model.get_tf_idf()
    print(tfidf_matrix)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM