generated from wystan_rin/template
Upload clustering results
This commit is contained in:
parent c022a5b06b
commit 81f10f8187
kmeans.py
@@ -0,0 +1,124 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :recommender
@File :kmeans.py
@IDE :PyCharm
@Author :rengengchen
@Time :2023/12/29 11:53
"""
import time

import numpy as np
import torch
from tqdm import tqdm


class KMEANS:
    def __init__(self, n_clusters=20, max_iter=None, verbose=True, device=torch.device("cpu")):
        self.n_clusters = n_clusters
        self.labels = None
        self.dists = None  # shape: [x.shape[0], n_clusters]
        self.centers = None
        self.variation = torch.Tensor([float("Inf")]).to(device)
        self.verbose = verbose
        self.started = False
        self.representative_samples = None
        self.max_iter = max_iter
        self.count = 0
        self.device = device

    def fit(self, x):
        # kmeans++ initialisation: start from one randomly chosen sample,
        # then repeatedly add the sample farthest from the existing centers
        init_idx = int(np.random.uniform(0, x.shape[0]))
        self.centers = x[init_idx].reshape(1, -1)
        for i in range(self.n_clusters - 1):
            dis = 0
            for j, cj in enumerate(self.centers):
                d = ((x - cj) ** 2).sum(1)
                if j == 0:
                    dis = d
                else:
                    dis += d
            self.centers = torch.cat((self.centers, x[dis.argmax(0)].reshape(1, -1)), 0)

        while True:
            # assign each sample to its nearest center
            self.nearest_center(x)
            # update the centers
            self.update_center(x)
            if self.verbose:
                print(self.variation, torch.argmin(self.dists, (0)))
            if torch.abs(self.variation) < 1e-3 and self.max_iter is None:
                break
            elif self.max_iter is not None and self.count == self.max_iter:
                break

            self.count += 1

        self.representative_sample()

    def nearest_center(self, x):
        labels = torch.empty((x.shape[0],)).long().to(self.device)
        dists = torch.empty((0, self.n_clusters)).to(self.device)
        for i, sample in enumerate(x):
            dist = torch.sum(torch.mul(sample - self.centers, sample - self.centers), (1))
            labels[i] = torch.argmin(dist)
            dists = torch.cat([dists, dist.unsqueeze(0)], (0))
        self.labels = labels
        if self.started:
            self.variation = torch.sum(self.dists - dists)
        self.dists = dists
        self.started = True

    def update_center(self, x):
        centers = torch.empty((0, x.shape[1])).to(self.device)
        for i in range(self.n_clusters):
            mask = self.labels == i
            cluster_samples = x[mask]
            centers = torch.cat([centers, torch.mean(cluster_samples, (0)).unsqueeze(0)], (0))
        self.centers = centers

    def representative_sample(self):
        # the sample closest to each center serves as that cluster's representative,
        # which is easier to interpret than the center itself
        self.representative_samples = torch.argmin(self.dists, (0))


def time_clock(matrix, device):
    a = time.time()
    k = KMEANS(max_iter=10, verbose=False, device=device)
    k.fit(matrix)
    b = time.time()
    return (b - a) / k.count


def choose_device(cuda=False):
    if cuda:
        device = torch.device("cuda:0")
    else:
        device = torch.device("cpu")
    return device


if __name__ == "__main__":
    import matplotlib.pyplot as plt

    plt.figure()

    device = choose_device(False)

    cpu_speeds = []
    for i in tqdm([20, 100, 500, 2000, 8000, 20000]):
        matrix = torch.rand((10000, i)).to(device)
        speed = time_clock(matrix, device)
        cpu_speeds.append(speed)
    l1, = plt.plot([20, 100, 500, 2000, 8000, 20000], cpu_speeds, color='r', label='CPU')

    device = choose_device(True)

    gpu_speeds = []
    for i in tqdm([20, 100, 500, 2000, 8000, 20000]):
        matrix = torch.rand((10000, i)).to(device)
        speed = time_clock(matrix, device)
        gpu_speeds.append(speed)
    l2, = plt.plot([20, 100, 500, 2000, 8000, 20000], gpu_speeds, color='g', label="GPU")

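A minimal usage sketch of the KMEANS class above (not part of the commit; it assumes the corrected fit(x) signature shown here and uses random data purely for illustration):

    import torch

    # 500 random 16-dimensional samples; any float tensor shaped [n_samples, n_features] works
    data = torch.rand(500, 16)

    km = KMEANS(n_clusters=5, max_iter=20, verbose=False)
    km.fit(data)

    print(km.labels.shape)             # cluster index per sample, shape [500]
    print(km.centers.shape)            # cluster centers, shape [5, 16]
    print(km.representative_samples)   # index of the sample closest to each center
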
@@ -0,0 +1,184 @@
{
 "cells": [
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-31T09:24:30.589401Z",
     "start_time": "2024-12-31T09:24:30.582530Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import shutil\n",
    "\n",
    "from sklearn.metrics import silhouette_score\n",
    "from sklearn.cluster import KMeans\n",
    "from tfidf import TFIDF\n",
    "import os"
   ],
   "id": "bba700bd32b05545",
   "outputs": [],
   "execution_count": 22
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-31T08:43:07.638098Z",
     "start_time": "2024-12-31T08:43:07.632218Z"
    }
   },
   "cell_type": "code",
   "source": [
    "filenames = os.listdir(\"../data/items/SV300\")\n",
    "items = []\n",
    "for filename in filenames:\n",
    "    with open(os.path.join(\"../data/items/SV300\", filename), \"r\", encoding=\"utf8\") as f:\n",
    "        items.extend(item.strip() for item in f.readlines() if item)\n",
    "print(f\"参数数量: {len(items)}\")"
   ],
   "id": "6744375bdcb289a9",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "参数数量: 390\n"
     ]
    }
   ],
   "execution_count": 13
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-31T08:43:08.372427Z",
     "start_time": "2024-12-31T08:43:08.314269Z"
    }
   },
   "cell_type": "code",
   "source": [
    "tfidf = TFIDF(1, 0.1, 300)\n",
    "feature_names, tfidf_matrix = tfidf.fit(items)\n",
    "print(feature_names)"
   ],
   "id": "669c32f927f120d9",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['amv' 'aprv' 'asv' 'atrc' 'auto' 'autoflow' 'bilevel' 'bipap' 'btps'\n",
      " 'cmh2o' 'co2' 'cpap' 'cprv' 'duolevel' 'ibw' 'nif' 'p0.1' 'pbw' 'peep'\n",
      " 'prvc' 'psv' 'sigh' 'simv' 'tft' 'trc' 'tve' 'usb' 'vc' 'vdaw' 'vent'\n",
      " 'vt' 'vtalv' 'vte' '℃' '≤' '一体化' '一键' '上' '上升时间' '下' '两种' '中文' '主机'\n",
      " '二氧化碳' '交叉感染' '交流' '人机' '传感器' '低' '低压' '体重' '使' '供电' '便利' '信息' '值' '值同屏'\n",
      " '儿童' '充电电池' '内源性' '内置' '冻结' '减少' '出' '分析' '分级' '分辨率' '分钟' '判断' '剩余' '力学'\n",
      " '功' '动态' '动态显示' '包含' '包括' '医护人员' '升级' '单位理想' '压' '双' '双相' '变化' '口腔' '可充电'\n",
      " '可拆卸' '可调' '台' '台车' '叹息' '同屏' '同步' '同步性' '后备' '吸' '吸入' '吸呼' '吸氧' '吸痰' '呼'\n",
      " '呼吸机' '呼比' '品牌' '回顾' '图' '图形化' '增氧' '声光报警' '复苏' '如自' '婴幼儿' '孔径' '存储'\n",
      " '安全阀' '实时' '容积' '容量' '对比' '导出' '小儿' '小时' '屏' '屏幕' '屏幕显示' '峰值' '峰压' '工作'\n",
      " '工具' '常用' '常规' '平台' '平均' '年' '年限' '开机' '张' '形态' '彩色' '待机' '心肺' '总' '患者'\n",
      " '成人' '截图' '手动' '手提' '技术' '拓展' '持续' '指令' '指数' '接口' '控制' '控制屏' '提供' '提示'\n",
      " '提示信息' '提高' '插管' '操作' '操作界面' '数据' '整机' '文字' '方式' '方案' '方波' '无创' '无需' '日志'\n",
      " '时' '时间' '时间常数' '显示' '智能' '智能化' '最佳' '最小' '未来' '末' '末端' '机器' '机控' '条'\n",
      " '标配' '检查' '检测' '模块' '模式' '次' '正压' '死腔' '气体' '气源' '气管' '气道' '气阀' '氧' '氧气'\n",
      " '氧疗' '水平' '泄漏' '波' '波形' '流量' '浅快' '测定' '测试' '浓度' '消毒' '涡轮' '漏气' '潮气量'\n",
      " '灌注' '灵敏度' '环' '环可' '环图' '理想' '电动' '电控' '电池' '电源' '电量' '界面' '病人' '百分比'\n",
      " '监护' '监护仪' '盘' '目标' '直流' '确认' '种' '种环图' '科室' '空气' '窒息' '管路' '管道' '系统'\n",
      " '纯氧' '组件' '肺' '肺复' '肺泡' '脱机' '自主' '自动' '自动识别' '自检' '至少' '舒适度' '英寸' '蒸汽'\n",
      " '补偿' '表' '视图' '触发' '触摸' '计时' '计算' '认证' '记录' '设' '设定' '设置' '设计' '调节' '负压'\n",
      " '趋势' '身高' '转运' '辅助' '过低' '过高' '选配' '递减' '道' '部件' '采用' '释放' '重量' '量' '锁'\n",
      " '锂电池' '闭合' '间歇' '阻力' '阻抗' '雾化' '静态' '顺应性' '频率' '驱动' '高' '高压' '高温' '高温高压'\n",
      " '高级']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\wystan\\anaconda3\\envs\\TenderAutomateSystem\\Lib\\site-packages\\sklearn\\feature_extraction\\text.py:517: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "execution_count": 14
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-31T09:26:57.905444Z",
     "start_time": "2024-12-31T09:26:57.811280Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# define the number of clusters\n",
    "num_clusters = 50\n",
    "kmeans = KMeans(n_clusters=num_clusters, random_state=42)\n",
    "kmeans.fit(tfidf_matrix)\n",
    "labels = kmeans.labels_\n",
    "\n",
    "# reset the results directory\n",
    "clusters_dir = \"../data/clusters/SV300\"\n",
    "shutil.rmtree(clusters_dir)\n",
    "os.makedirs(clusters_dir)\n",
    "# write each document to the file of its cluster label\n",
    "for idx, label in enumerate(labels):\n",
    "    with open(os.path.join(clusters_dir, f\"{label}.txt\"), \"a\", encoding=\"utf8\") as f:\n",
    "        f.write(items[idx])\n",
    "        f.write(\"\\n\")\n",
    "\n",
    "# optional: compute the silhouette score to evaluate the clustering\n",
    "score = silhouette_score(tfidf_matrix, labels)\n",
    "print(f\"轮廓系数: {score}\")\n",
    "\n",
    "# inspect the top terms of each cluster center\n",
    "order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]\n",
    "for label in range(num_clusters):\n",
    "    top_terms = [feature_names[ind] for ind in order_centroids[label, :10]]\n",
    "    with open(os.path.join(clusters_dir, f\"{label}_words.txt\"), \"a\", encoding=\"utf8\") as f:\n",
    "        f.write(\", \".join(top_terms))\n"
   ],
   "id": "c36c60812d8c902b",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "轮廓系数: 0.3455008313963117\n"
     ]
    }
   ],
   "execution_count": 23
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "5540d5575e1d5965"
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

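The last executed cell ranks vocabulary terms per cluster with kmeans.cluster_centers_.argsort()[:, ::-1]. A tiny illustrative sketch of that indexing trick (the names and weights below are made up, not taken from the notebook):

    import numpy as np

    feature_names = np.array(["peep", "vt", "报警", "屏幕"])   # hypothetical vocabulary
    centers = np.array([[0.1, 0.7, 0.0, 0.2],                  # hypothetical cluster centers
                        [0.5, 0.0, 0.4, 0.1]])

    # argsort sorts each row ascending; [:, ::-1] reverses it so the
    # highest-weighted term indices come first, which the notebook slices with [:10]
    order_centroids = centers.argsort()[:, ::-1]
    for label, row in enumerate(order_centroids):
        print(label, [feature_names[i] for i in row[:2]])
    # 0 ['vt', '屏幕']
    # 1 ['peep', '报警']
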
requirements.txt
@@ -1,3 +1,4 @@
pandas==2.2.3
numpy==2.2.1
openpyxl==3.1.5
jieba==0.42.1

tf_idf.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :recommender
@File :tf_idf.py
@IDE :PyCharm
@Author :rengengchen
@Time :2024/3/13 15:38
"""
import os.path
import re

import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

re_num = re.compile(r'\d+')

stop_words_file = os.path.join(os.path.dirname(__file__), 'ai/stop_words.txt')
with open(stop_words_file, mode='r', encoding='utf-8') as f:
    stop_words = set(word.strip() for word in f.readlines())


def filter_word(words):
    # drop pure numbers and stop words before vectorisation
    filtered_words = []
    for word in words:
        if re_num.match(word):
            continue
        if word in stop_words:
            continue
        filtered_words.append(word)
    return filtered_words


def jieba_tokenize(text):
    words = jieba.lcut(text)
    words = filter_word(words)
    return words


class TFIDF:
    def __init__(self, min_df, max_df, max_features):
        self.min_df = min_df
        self.max_df = max_df
        self.max_features = max_features

    def count_words(self, corpus):
        cv = CountVectorizer(tokenizer=jieba_tokenize, min_df=self.min_df, max_df=self.max_df,
                             max_features=self.max_features)
        return cv, cv.fit_transform(corpus)

    def fit(self, corpus, get_feature_names=True):
        cv, words_matrix = self.count_words(corpus)

        tfidf_transformer = TfidfTransformer()
        tfidf = tfidf_transformer.fit_transform(words_matrix)
        if get_feature_names:
            return cv.get_feature_names_out(), tfidf.toarray()
        # rows are documents, columns are vocabulary terms
        return tfidf.toarray()

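A minimal sketch of calling the TFIDF helper above on a toy corpus (the sentences and the min_df/max_df/max_features values are illustrative only; it assumes ai/stop_words.txt exists next to the module, as required by the import-time read above):

    from tfidf import TFIDF   # module name as imported in the notebook

    corpus = [
        "呼吸机具有容量控制通气模式",
        "呼吸机支持压力控制通气模式",
        "具有雾化功能和声光报警功能",
    ]

    tfidf = TFIDF(min_df=1, max_df=1.0, max_features=50)
    feature_names, tfidf_matrix = tfidf.fit(corpus)

    print(feature_names)        # vocabulary kept by CountVectorizer after jieba tokenisation
    print(tfidf_matrix.shape)   # (n_documents, n_terms)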