TenderAutomateSystem/tfidf.py

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project recommender
@File tf_idf.py
@IDE PyCharm
@Author rengengchen
@Time 2024/3/13 15:38
"""
import os.path
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

re_num = re.compile(r'\d+')

# Load the stop-word list that ships alongside this module.
stop_words_file = os.path.join(os.path.dirname(__file__), 'ai/stop_words.txt')
with open(stop_words_file, mode='r', encoding='utf-8') as f:
    stop_words = set(word.strip() for word in f)


def filter_word(words):
    """Drop tokens that start with a digit or appear in the stop-word list."""
    filtered_words = []
    for word in words:
        if re_num.match(word):
            continue
        if word in stop_words:
            continue
        filtered_words.append(word)
    return filtered_words


def jieba_tokenize(text):
    """Segment text with jieba, then strip numeric and stop-word tokens."""
    words = jieba.lcut(text)
    words = filter_word(words)
    return words
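
# Illustrative sketch (not in the original file): how the tokenizer behaves on
# its own before being handed to CountVectorizer below. The sample text is
# made up; which tokens survive depends on jieba's dictionary and on the
# contents of ai/stop_words.txt.
#
#     tokens = jieba_tokenize('2024年公开招标公告')
#     # '2024' is dropped by the \d+ filter; any remaining token listed in
#     # stop_words (e.g. '年', if it is in the file) is dropped as well.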


class TFIDF:
    """Thin wrapper around scikit-learn's CountVectorizer + TfidfTransformer."""

    def __init__(self, min_df, max_df, max_features):
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features

    def count_words(self, corpus):
        # Build the raw term-frequency matrix using the jieba tokenizer.
        cv = CountVectorizer(tokenizer=jieba_tokenize, min_df=self.min_df, max_df=self.max_df,
                             max_features=self.max_features)
        return cv, cv.fit_transform(corpus)

    def fit(self, corpus, get_feature_names=True):
        cv, words_matrix = self.count_words(corpus)
        tfidf_transformer = TfidfTransformer()
        tfidf = tfidf_transformer.fit_transform(words_matrix)
        if get_feature_names:
            return cv.get_feature_names_out(), tfidf.toarray()
        # Rows correspond to documents, columns to vocabulary words.
        return tfidf.toarray()
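

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original file): the two-document
    # corpus is made-up sample data, and the min_df/max_df/max_features values
    # are arbitrary placeholders, not settings taken from this project.
    sample_corpus = [
        '招标公告:某单位2024年办公设备采购项目',
        '中标结果公示:网络安全服务采购项目',
    ]
    model = TFIDF(min_df=1, max_df=1.0, max_features=1000)
    feature_names, weights = model.fit(sample_corpus)
    # weights has shape (n_documents, n_features); each row is one document's
    # TF-IDF vector over the learned vocabulary.
    print(feature_names)
    print(weights)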