#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :recommender
@File    :tf_idf.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2024/3/13 15:38
"""
import os.path
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Tokens that begin with a digit are treated as noise and filtered out.
re_num = re.compile(r'\d+')

# Load the stop-word list (one word per line) that ships next to this module.
stop_words_file = os.path.join(os.path.dirname(__file__), 'ai/stop_words.txt')
with open(stop_words_file, mode='r', encoding='utf-8') as f:
    stop_words = set(word.strip() for word in f)


def filter_word(words):
    """Drop number-like tokens and stop words from a token list."""
    filtered_words = []
    for word in words:
        if re_num.match(word):  # token starts with digits
            continue
        if word in stop_words:
            continue
        filtered_words.append(word)
    return filtered_words

def jieba_tokenize(text):
    """Segment Chinese text with jieba, then filter the resulting tokens."""
    words = jieba.lcut(text)
    words = filter_word(words)
    return words
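# Illustrative example (the exact output depends on jieba's dictionary and on
# the contents of stop_words.txt, so treat this as an assumption):
#   jieba_tokenize('今天天气不错2024')  ->  e.g. ['今天', '天气', '不错']
#   ('2024' is segmented as its own token and removed by the re_num filter.)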
class TFIDF:
    def __init__(self, min_df, max_df, max_features):
        # Vocabulary-pruning parameters, passed straight to CountVectorizer.
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features

    def count_words(self, corpus):
        """Build a document-term count matrix from the raw corpus."""
        cv = CountVectorizer(tokenizer=jieba_tokenize, min_df=self.min_df,
                             max_df=self.max_df, max_features=self.max_features)
        return cv, cv.fit_transform(corpus)

    def fit(self, corpus, get_feature_names=True):
        """Compute TF-IDF weights for the corpus, optionally returning the
        vocabulary in column order alongside the weight matrix."""
        cv, words_matrix = self.count_words(corpus)

        tfidf_transformer = TfidfTransformer()
        tfidf = tfidf_transformer.fit_transform(words_matrix)
        if get_feature_names:
            return cv.get_feature_names_out(), tfidf.toarray()
        # Rows are documents, columns are words.
        return tfidf.toarray()
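

# --- Usage sketch (added for illustration; not part of the original module) ---
# A minimal, hypothetical demo of the TFIDF class on a tiny in-memory corpus.
# The corpus strings and parameter values below are assumptions chosen for
# demonstration; running this also requires ai/stop_words.txt to exist next
# to this file, since it is read at import time.
if __name__ == '__main__':
    corpus = [
        '我喜欢看电影，也喜欢读书。',
        '今天天气不错，适合看电影。',
        '他在图书馆读书。',
    ]
    model = TFIDF(min_df=1, max_df=1.0, max_features=1000)
    feature_names, weights = model.fit(corpus)
    # weights[i][j] is the TF-IDF weight of vocabulary word j in document i.
    for i, row in enumerate(weights):
        top = sorted(zip(feature_names, row), key=lambda t: -t[1])[:3]
        print(f'doc {i}: {top}')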