# %% Imports
import os
import shutil

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from tfidf import TFIDF

# %% Load the parameter items (one item per non-blank line of every file)
# Single source of truth for the input directory (was repeated as a literal).
ITEMS_DIR = "../data/items/SV300"

items = []
# os.listdir returns entries in arbitrary order; sort so the item order (and
# therefore the downstream clustering) is reproducible across machines.
for filename in sorted(os.listdir(ITEMS_DIR)):
    path = os.path.join(ITEMS_DIR, filename)
    if not os.path.isfile(path):
        # Skip sub-directories and other non-regular entries instead of
        # crashing in open().
        continue
    with open(path, "r", encoding="utf8") as f:
        # Strip first, then filter: a raw line always still contains its
        # trailing "\n", so testing the unstripped line never drops blank
        # lines and would feed empty documents into TF-IDF.
        stripped_lines = (line.strip() for line in f)
        items.extend(line for line in stripped_lines if line)
print(f"参数数量: {len(items)}")

# %% Build the TF-IDF representation of the items
# NOTE(review): the three positional arguments are presumably
# (min_df, max_df, max_features) -- TODO confirm against tfidf.TFIDF and
# switch to keyword arguments once verified.
tfidf = TFIDF(1, 0.1, 300)
# fit() returns the vocabulary and the document-term matrix, in that order.
feature_names, tfidf_matrix = tfidf.fit(items)
print(feature_names)
# %% Cluster the items with K-Means and write one file per cluster
# Number of clusters to form.
num_clusters = 50
# Fixed random_state so repeated runs give the same partition.
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)
labels = kmeans.labels_

# Reset the results directory. Only remove it when it exists: on a fresh
# checkout there is nothing to delete and an unconditional rmtree raises
# FileNotFoundError, which breaks "Restart & Run All".
clusters_dir = "../data/clusters/SV300"
if os.path.isdir(clusters_dir):
    shutil.rmtree(clusters_dir)
os.makedirs(clusters_dir, exist_ok=True)

# Group the items by cluster label, preserving their original order, so each
# cluster file is opened and written exactly once.
items_by_label = {}
for item, label in zip(items, labels):
    items_by_label.setdefault(int(label), []).append(item)

# Write "<label>.txt" with one item per line. Mode "w" keeps the cell
# idempotent even if stale files were somehow left behind.
for label, members in items_by_label.items():
    with open(os.path.join(clusters_dir, f"{label}.txt"), "w", encoding="utf8") as f:
        f.writelines(f"{member}\n" for member in members)

# Optional: silhouette coefficient as a quality measure of the clustering.
score = silhouette_score(tfidf_matrix, labels)
print(f"轮廓系数: {score}")

# Top terms of every cluster: sort each centroid's feature weights in
# descending order and keep the ten highest-weighted feature names.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
for label in range(num_clusters):
    top_terms = [feature_names[ind] for ind in order_centroids[label, :10]]
    with open(os.path.join(clusters_dir, f"{label}_words.txt"), "w", encoding="utf8") as f:
        f.write(", ".join(top_terms))