TenderAutomateSystem/tfidf.py

60 lines
1.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project recommender
@File tfidf.py
@IDE PyCharm
@Author rengengchen
@Time 2024/3/13 15:38
"""
import os.path
import re
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Pattern for one or more digits; applied with ``match`` so it tests the
# beginning of a token.
re_num = re.compile(r'\d+')

# The stop-word list lives next to this module, one entry per line.
stop_words_file = os.path.join(os.path.dirname(__file__), 'ai/stop_words.txt')
with open(stop_words_file, mode='r', encoding='utf-8') as f:
    # Strip surrounding whitespace/newlines and de-duplicate into a set for
    # O(1) membership tests.
    stop_words = {line.strip() for line in f}
def filter_word(words):
    """Return the tokens of ``words`` that survive filtering, in order.

    A token is discarded when ``re_num`` matches at its start (it begins
    with a digit) or when it is a member of the module-level ``stop_words``
    set.
    """
    return [
        token
        for token in words
        if not re_num.match(token) and token not in stop_words
    ]
def jieba_tokenize(text):
    """Segment ``text`` with ``jieba.lcut`` and pass the result through ``filter_word``."""
    return filter_word(jieba.lcut(text))
class TFIDF:
    """Compute TF-IDF weights for a corpus tokenized with ``jieba_tokenize``.

    The constructor arguments are stored and later forwarded unchanged to
    ``CountVectorizer`` as its ``min_df``, ``max_df`` and ``max_features``
    parameters.
    """

    def __init__(self, min_df, max_df, max_features):
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features

    def count_words(self, corpus):
        """Fit a ``CountVectorizer`` on ``corpus`` and count its terms.

        Args:
            corpus: The documents handed to ``CountVectorizer.fit_transform``.

        Returns:
            A ``(vectorizer, counts)`` tuple: the fitted ``CountVectorizer``
            and the matrix returned by its ``fit_transform``.
        """
        cv = CountVectorizer(
            tokenizer=jieba_tokenize,
            # With a custom tokenizer the default token_pattern is never
            # used; passing None keeps scikit-learn from warning about the
            # unused parameter on every fit.
            token_pattern=None,
            min_df=self.min_df,
            max_df=self.max_df,
            max_features=self.max_features,
        )
        return cv, cv.fit_transform(corpus)

    def fit(self, corpus, get_feature_names=True):
        """Compute the dense TF-IDF matrix for ``corpus``.

        Args:
            corpus: The documents handed to ``count_words``.
            get_feature_names: When true, also return the vocabulary.

        Returns:
            ``(feature_names, tfidf)`` when ``get_feature_names`` is true,
            otherwise just ``tfidf``. ``tfidf`` is a dense array whose rows
            are documents and whose columns are words.
        """
        cv, words_matrix = self.count_words(corpus)
        # Rows are documents, columns are words.
        tfidf = TfidfTransformer().fit_transform(words_matrix).toarray()
        if get_feature_names:
            return cv.get_feature_names_out(), tfidf
        return tfidf