#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :recommender
@File    :tf_idf.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2024/3/13 15:38
"""
import os.path
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Tokens that begin with a digit are treated as noise and filtered out.
re_num = re.compile(r'\d+')

# Load the stop-word list (one word per line) that ships next to this module.
stop_words_file = os.path.join(os.path.dirname(__file__), 'ai/stop_words.txt')
with open(stop_words_file, mode='r', encoding='utf-8') as f:
    stop_words = set(word.strip() for word in f)


def filter_word(words):
    """Drop number-like tokens and stop words from a token list."""
    filtered_words = []
    for word in words:
        if re_num.match(word):  # token starts with digits
            continue
        if word in stop_words:
            continue
        filtered_words.append(word)
    return filtered_words

def jieba_tokenize(text):
    """Segment Chinese text with jieba, then filter the resulting tokens."""
    words = jieba.lcut(text)
    words = filter_word(words)
    return words
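# Illustrative example (the exact output depends on jieba's dictionary and on
# the contents of stop_words.txt, so treat this as an assumption):
#   jieba_tokenize('今天天气不错2024')  ->  e.g. ['今天', '天气', '不错']
#   ('2024' is segmented as its own token and removed by the re_num filter.)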
class TFIDF:
    def __init__(self, min_df, max_df, max_features):
        # Vocabulary-pruning parameters, passed straight to CountVectorizer.
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features

    def count_words(self, corpus):
        """Build a document-term count matrix from the raw corpus."""
        cv = CountVectorizer(tokenizer=jieba_tokenize, min_df=self.min_df,
                             max_df=self.max_df, max_features=self.max_features)
        return cv, cv.fit_transform(corpus)

    def fit(self, corpus, get_feature_names=True):
        """Compute TF-IDF weights for the corpus, optionally returning the
        vocabulary in column order alongside the weight matrix."""
        cv, words_matrix = self.count_words(corpus)

        tfidf_transformer = TfidfTransformer()
        tfidf = tfidf_transformer.fit_transform(words_matrix)
        if get_feature_names:
            return cv.get_feature_names_out(), tfidf.toarray()
        # Rows are documents, columns are words.
        return tfidf.toarray()
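

# --- Usage sketch (added for illustration; not part of the original module) ---
# A minimal, hypothetical demo of the TFIDF class on a tiny in-memory corpus.
# The corpus strings and parameter values below are assumptions chosen for
# demonstration; running this also requires ai/stop_words.txt to exist next
# to this file, since it is read at import time.
if __name__ == '__main__':
    corpus = [
        '我喜欢看电影，也喜欢读书。',
        '今天天气不错，适合看电影。',
        '他在图书馆读书。',
    ]
    model = TFIDF(min_df=1, max_df=1.0, max_features=1000)
    feature_names, weights = model.fit(corpus)
    # weights[i][j] is the TF-IDF weight of vocabulary word j in document i.
    for i, row in enumerate(weights):
        top = sorted(zip(feature_names, row), key=lambda t: -t[1])[:3]
        print(f'doc {i}: {top}')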