#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :recommender
@File    :tf_idf.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2024/3/13 15:38
"""
import os.path
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# tokens that begin with a digit
re_num = re.compile(r'\d+')

# load the stop-word list that ships alongside this module
stop_words_file = os.path.join(os.path.dirname(__file__), 'ai/stop_words.txt')
with open(stop_words_file, mode='r', encoding='utf-8') as f:
    stop_words = set(word.strip() for word in f)


def filter_word(words):
    """Drop numeric tokens and stop words from a token list."""
    filtered_words = []
    for word in words:
        # skip tokens that start with a digit
        if re_num.match(word):
            continue
        # skip stop words
        if word in stop_words:
            continue
        filtered_words.append(word)
    return filtered_words


def jieba_tokenize(text):
    """Segment Chinese text with jieba and filter the resulting tokens."""
    words = jieba.lcut(text)
    words = filter_word(words)
    return words
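

# For example, jieba's precise mode segments '我来到北京清华大学' into
# ['我', '来到', '北京', '清华大学']; filter_word then removes whichever of
# those tokens appear in stop_words, so the final output depends on the
# project's stop-word file.
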
class TFIDF:
    """Build TF-IDF vectors over a corpus, using jieba for tokenization."""

    def __init__(self, min_df, max_df, max_features):
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features

    def count_words(self, corpus):
        # token_pattern=None silences the warning that the default pattern
        # is ignored once a custom tokenizer is supplied
        cv = CountVectorizer(tokenizer=jieba_tokenize, token_pattern=None,
                             min_df=self.min_df, max_df=self.max_df,
                             max_features=self.max_features)
        return cv, cv.fit_transform(corpus)

    def fit(self, corpus, get_feature_names=True):
        cv, words_matrix = self.count_words(corpus)

        # re-weight the raw term counts into TF-IDF scores
        tfidf_transformer = TfidfTransformer()
        tfidf = tfidf_transformer.fit_transform(words_matrix)
        if get_feature_names:
            # get_feature_names_out requires scikit-learn >= 1.0
            return cv.get_feature_names_out(), tfidf.toarray()
        # rows are documents, columns are words
        return tfidf.toarray()
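

# A minimal usage sketch (not part of the original module). The corpus below
# is made up for illustration, and running it assumes 'ai/stop_words.txt'
# exists next to this file, since the stop-word set is loaded at import time.
if __name__ == '__main__':
    corpus = [
        '我喜欢看电影和读书',
        '他喜欢打篮球',
        '我们一起去看电影',
    ]
    model = TFIDF(min_df=1, max_df=1.0, max_features=1000)
    feature_names, tfidf_matrix = model.fit(corpus)
    # one row per document, one column per retained token
    print(feature_names)
    print(tfidf_matrix)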