xhs/visualization.py

192 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project redbook
@File visualization.py
@IDE PyCharm
@Author rengengchen
@Time 2024/4/17 14:44
"""
import os
import tkinter as tk
import tkinter.font as tkFont
from tkinter import filedialog
from urllib.parse import urlparse
from loguru import logger
from config import HEAD, RESULT_PATH, modify_result_path, modify_encoding, modify_filter_words
from scraper import read_comment
def run():
    """Handle a click on the start button: read the form and scrape comments.

    Reads the cookie, output encoding, filter words, note links and the
    interval bounds from the module-level widgets, pushes them into the
    shared configuration (``HEAD``, ``modify_encoding``,
    ``modify_filter_words``), writes the CSV header line to ``comments.csv``
    in the current result directory and then calls ``read_comment`` once per
    note id.

    The outcome is displayed in the ``result`` label.  Any exception is shown
    there and then re-raised so Tk's callback error reporting still sees it.
    The button is always restored to its idle text and state afterwards.

    Raises:
        ValueError: if either interval field does not contain an integer.
        Exception: anything raised while writing the header or scraping.
    """
    # Imported here so the result directory is looked up at click time.  The
    # module-level ``from config import RESULT_PATH`` keeps the string that
    # was bound at import and cannot see a directory chosen afterwards.
    import config

    confirm_button.config(text='采集中')
    confirm_button['state'] = tk.DISABLED
    # This handler blocks the event loop until scraping ends; flush pending
    # redraws now, otherwise the "in progress" button text is never painted.
    confirm_button.update_idletasks()
    try:
        HEAD['Cookie'] = cookie_text.get("1.0", tk.END).strip()
        modify_encoding(encoding_text.get("1.0", tk.END).strip())

        # foc_in() clears the grey hint text so it is not read as user input.
        keywords_text.foc_in()
        raw_words = keywords_text.get("1.0", tk.END).strip()
        # ''.split('\t') yields [''], so drop empty entries.  An empty filter
        # word would presumably match every comment in the scraper (not
        # visible from this file).
        modify_filter_words([word for word in raw_words.split('\t') if word])

        note_link_text.foc_in()
        note_ids = []
        for line in note_link_text.get("1.0", tk.END).split('\n'):
            link = line.strip()
            if not link:
                continue
            # The note id is the last path segment; tolerate a trailing '/'.
            note_id = urlparse(link).path.rstrip('/').split('/')[-1]
            if note_id:
                note_ids.append(note_id)

        # Parse the interval once, before the existing output is truncated.
        min_wait = int(sec1.get("1.0", tk.END))
        max_wait = int(sec2.get("1.0", tk.END))

        header = ['nickname', 'content', 'ip_location', 'level', 'user_id',
                  'status', 'liked', 'create_time', 'note_id', 'id']
        csv_path = os.path.join(config.RESULT_PATH, 'comments.csv')
        # Mode 'w' deliberately overwrites any previous comments.csv.
        with open(csv_path, mode='w', encoding='utf8', errors='ignore') as f:
            f.write(','.join(header) + '\n')

        for note_id in note_ids:
            logger.info(f'scrape {note_id}')
            read_comment(note_id, s1=min_wait, s2=max_wait)
    except Exception as e:
        # NOTE(review): failure is shown in green and success in red; this
        # looks inverted but may be intentional, so it is left unchanged.
        result.config(text=f'\U0001F605fail!!!\U0001F605\n{e}', fg='green')
        raise
    else:
        result.config(text='\U0001F618success!!!\U0001F618', fg='red')
        logger.info('over')
    finally:
        confirm_button.config(text='开始采集评论')
        confirm_button['state'] = tk.NORMAL
def select_path():
    """Let the user pick the output directory and apply the choice.

    Opens a directory chooser.  When it returns a non-empty path, that path
    is passed to ``modify_result_path`` and the hint label is updated to show
    it.  A falsy result (for example, a cancelled dialog) changes nothing.
    """
    chosen_dir = filedialog.askdirectory()
    if not chosen_dir:
        return
    modify_result_path(chosen_dir)
    hint = "评论输出到指定目录下的comments.csv, 会覆盖已存在文件\n已指定存储目录: " + chosen_dir
    label_file.config(text=hint)
class PlaceholderText(tk.Text):
    """Text widget that displays hint text while it holds no real input.

    The hint is inserted in ``color`` when the widget is created and whenever
    focus leaves it with only whitespace inside.  It is removed, and the
    original foreground colour restored, when the widget gains focus.
    Callers that read the content programmatically can call ``foc_in()``
    first so the hint is not mistaken for user input.
    """

    def __init__(self, master=None, placeholder="请输入文本", color='grey', **kwargs):
        """Create the widget and show the placeholder.

        Args:
            master: parent widget, forwarded to ``tk.Text``.
            placeholder: hint text shown while the widget is empty.
            color: foreground colour used for the hint text.
            **kwargs: any other ``tk.Text`` options.
        """
        super().__init__(master, **kwargs)
        self.placeholder = placeholder
        self.placeholder_color = color
        # Remember the normal text colour so it can be restored later.
        self.default_fg_color = self['fg']
        self._show_placeholder()
        self.bind("<FocusIn>", self.foc_in)
        self.bind("<FocusOut>", self.foc_out)

    def _show_placeholder(self):
        """Replace the whole content with the placeholder in its hint colour."""
        # Clear first: leftover whitespace after the inserted hint would make
        # the content differ from the placeholder, so foc_in() would never
        # remove it and the hint would be treated as real input.
        self.delete("1.0", "end")
        self.insert("1.0", self.placeholder)
        self['fg'] = self.placeholder_color

    def foc_in(self, event=None):
        """Remove the placeholder, if it is currently shown.

        Args:
            event: Tk event when used as a ``<FocusIn>`` handler; unused.
        """
        showing_hint = (
            self.get("1.0", "end-1c") == self.placeholder
            # str() guards against the option value not being a plain string.
            and str(self['fg']) == str(self.placeholder_color)
        )
        if showing_hint:
            self.delete("1.0", "end")
            self['fg'] = self.default_fg_color

    def foc_out(self, event=None):
        """Show the placeholder again when the widget holds only whitespace.

        Args:
            event: Tk event when used as a ``<FocusOut>`` handler; unused.
        """
        if not self.get("1.0", "end-1c").strip():
            self._show_placeholder()
# Create the main application window (title and minimum size are fixed here).
root = tk.Tk()
root.title('小红书评论采集器')
root.minsize(width=850, height=650)
# Top frame: holds the large red banner label, stretched across the window.
top_frame = tk.Frame(root)
top_frame.pack(fill=tk.X, padx=10, pady=10)
# NOTE(review): this module-level name shadows the `copyright` builtin that
# the `site` module installs for interactive sessions.
copyright = tk.Label(top_frame, text='老板好', font=('微软雅黑', 30), fg='red')
copyright.pack(side=tk.TOP)
# Shared font for all text-entry widgets
entry_font = tkFont.Font(family='微软雅黑', size=14)
# Main input frame: cookie, note links and filter words
middle_frame = tk.Frame(root)
middle_frame.pack(fill=tk.X, padx=10, pady=5)
# Cookie label and single-line text box (read by run() and stored in HEAD['Cookie'])
tk.Label(middle_frame, text='Cookie:', font=('微软雅黑', 14)).grid(row=0, column=0, sticky='w')
cookie_text = tk.Text(middle_frame, width=72, height=1, font=entry_font)
cookie_text.grid(row=0, column=1, padx=30, pady=5, columnspan=2, sticky='ew') # stretch horizontally
# Note link input (one link per line) with placeholder and vertical scrollbar
tk.Label(middle_frame, text='笔记链接, 多个笔记链接请换行输入:',
font=('微软雅黑', 14)).grid(row=1, column=0, sticky='w', pady=5)
note_link_text = PlaceholderText(middle_frame,
placeholder="例:\n"
"https://www.xiaohongshu.com/explore/66174eea0000f00a1b00c6c6\n"
"https://www.xiaohongshu.com/explore/4517423a0000f00a1b00c6c6",
font=entry_font, width=110, height=7)
note_link_text.grid(row=2, column=0, sticky='ew', columnspan=2)
# Wire the scrollbar and the text widget to each other
scroll = tk.Scrollbar(middle_frame, command=note_link_text.yview)
scroll.grid(row=2, column=2, sticky='ns')
note_link_text.config(yscrollcommand=scroll.set)
# Filter (stop) words; run() splits the field's content on tab characters
tk.Label(middle_frame, text='过滤停用词:',
font=('微软雅黑', 14)).grid(row=3, column=0, sticky='w', pady=5)
keywords_text = PlaceholderText(middle_frame,
placeholder="过滤词之间用Tab键(制表键)分隔, 例: 老板\t送我\tStellar Blade豪华版\t爽一下",
font=entry_font, width=3, height=1)
keywords_text.grid(row=3, column=1, sticky='ew', columnspan=2)
# Give column 1 the extra horizontal space so the 'ew' widgets expand
middle_frame.grid_columnconfigure(1, weight=1)
# Second input frame: crawl interval range and output encoding
middle_frame1 = tk.Frame(root)
middle_frame1.pack(fill=tk.X, pady=5)
# Interval range: two small text boxes, defaulting to 1 and 3.
# run() converts both with int() and passes them to read_comment as s1/s2.
tk.Label(middle_frame1, text='每次爬取间隔时间范围, 单位: 秒 (在区间内随机): ',
font=('微软雅黑', 14)).grid(row=3, column=0, sticky='w', pady=5)
sec1 = tk.Text(middle_frame1, width=3, height=1, font=entry_font)
sec1.grid(row=3, column=1, sticky='w')
sec1.insert('1.0', '1')
tk.Label(middle_frame1, text='s', font=('微软雅黑', 14)).grid(row=3, column=2, sticky='w')
tk.Label(middle_frame1, text='———', font=('微软雅黑', 14)).grid(row=3, column=3, sticky='ew', columnspan=2)
sec2 = tk.Text(middle_frame1, width=3, height=1, font=entry_font)
sec2.grid(row=3, column=5, sticky='w')
sec2.insert('1.0', '3')
tk.Label(middle_frame1, text='s', font=('微软雅黑', 14)).grid(row=3, column=6, sticky='w')
# Output encoding, defaulting to GBK; run() passes it to modify_encoding
tk.Label(middle_frame1, text='存储结果编码: ',
font=('微软雅黑', 14)).grid(row=4, column=0, sticky='e', pady=5)
encoding_text = tk.Text(middle_frame1, width=8, height=1, font=entry_font)
encoding_text.grid(row=4, column=1, sticky='w', columnspan=3)
encoding_text.insert('1.0', 'GBK')
# Output-directory frame: chooser button plus a label describing the target
middle_frame2 = tk.Frame(root)
middle_frame2.pack(fill=tk.X, padx=10, pady=5)
# The button opens the directory chooser via select_path, which also
# rewrites the label text below once a directory has been chosen.
upload_button = tk.Button(middle_frame2, text="选择存储目录", command=select_path, padx=30, pady=5)
upload_button.grid(row=1, column=0, padx=(100, 20), pady=5)
# Initial label text shows the RESULT_PATH value imported from config
label_file = tk.Label(middle_frame2, text='评论输出到指定目录下的comments.csv, 会覆盖已存在文件\n'
f'默认输出目录: {RESULT_PATH}',
font=('微软雅黑', 10), fg='blue')
label_file.grid(row=1, column=1, padx=(5, 10), pady=5, sticky='w')
# Bottom frame: the start button and the label that reports the outcome
bottom_frame = tk.Frame(root)
bottom_frame.pack(fill=tk.X, padx=10, pady=5)
# Clicking the button calls run(), which also changes its text and state
confirm_button = tk.Button(bottom_frame, text='开始采集评论', command=run, padx=100, pady=10)
confirm_button.pack(side=tk.TOP)
# Initially empty; run() fills in the success or failure message
result = tk.Label(bottom_frame, font=('微软雅黑', 10))
result.pack(side=tk.TOP)
def main():
    """Register the log file sink and start the Tk event loop.

    The sink lives in a ``logs`` directory under ``RESULT_PATH`` and is added
    with hourly rotation, ``retention=12`` and zip compression.  The call
    then blocks in ``root.mainloop()``.
    """
    log_dir = os.path.join(RESULT_PATH, 'logs')
    sink = os.path.join(log_dir, '{time:YYYY-MM-DD HH}.log')
    sink_options = dict(rotation="1 hour", retention=12, compression="zip")
    logger.add(sink, **sink_options)
    root.mainloop()


if __name__ == '__main__':
    main()