#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :recommender
@File    :tf_idf.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2024/3/13 15:38
"""
import os.path
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

re_num = re.compile(r'\d+')

stop_words_file = os.path.join(os.path.dirname(__file__), 'ai/stop_words.txt')
with open(stop_words_file, mode='r', encoding='utf-8') as f:
    stop_words = set(word.strip() for word in f if word.strip())


def filter_word(words):
    """Drop numeric tokens and stop words from a token list."""
    filtered_words = []
    for word in words:
        # skip tokens that start with a digit
        if re_num.match(word):
            continue
        # skip stop words loaded from stop_words.txt
        if word in stop_words:
            continue
        filtered_words.append(word)
    return filtered_words


def jieba_tokenize(text):
    """Segment Chinese text with jieba, then filter the tokens."""
    words = jieba.lcut(text)
    words = filter_word(words)
    return words


class TFIDF:
    def __init__(self, min_df, max_df, max_features):
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features

    def count_words(self, corpus):
        # token_pattern=None silences the sklearn warning that the default
        # pattern is ignored when a custom tokenizer is supplied
        cv = CountVectorizer(tokenizer=jieba_tokenize,
                             token_pattern=None,
                             min_df=self.min_df,
                             max_df=self.max_df,
                             max_features=self.max_features)
        return cv, cv.fit_transform(corpus)

    def fit(self, corpus, get_feature_names=True):
        cv, words_matrix = self.count_words(corpus)
        tfidf_transformer = TfidfTransformer()
        tfidf = tfidf_transformer.fit_transform(words_matrix)
        if get_feature_names:
            # rows are documents, columns are vocabulary terms
            return cv.get_feature_names_out(), tfidf.toarray()
        return tfidf.toarray()
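

if __name__ == '__main__':
    # A minimal usage sketch. The corpus below is hypothetical sample data,
    # not part of the original module, and the parameter values are just
    # reasonable defaults for a demo this small.
    demo_corpus = [
        '我喜欢看电影，尤其是科幻电影。',
        '这部电影的剧情很精彩。',
        '今天天气不错，适合出去散步。',
    ]
    # min_df=1 / max_df=1.0 keep every term in this tiny corpus;
    # max_features=None disables the vocabulary-size cap.
    model = TFIDF(min_df=1, max_df=1.0, max_features=None)
    feature_names, tfidf_matrix = model.fit(demo_corpus)
    # tfidf_matrix[i][j] is the TF-IDF weight of term j in document i;
    # print the three highest-weighted terms per document.
    for doc_vector in tfidf_matrix:
        top = doc_vector.argsort()[::-1][:3]
        print([(feature_names[i], round(doc_vector[i], 3)) for i in top])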