192 lines
7.7 KiB
Python
192 lines
7.7 KiB
Python
|
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Tkinter front end for the Xiaohongshu comment scraper.

@Project :redbook
@File :visualize.py
@IDE :PyCharm
@Author :rengengchen
@Time :2024/4/17 14:44
"""
|
|||
|
import os
|
|||
|
import tkinter as tk
|
|||
|
import tkinter.font as tkFont
|
|||
|
from tkinter import filedialog
|
|||
|
from urllib.parse import urlparse
|
|||
|
|
|||
|
from loguru import logger
|
|||
|
|
|||
|
from config import HEAD, RESULT_PATH, modify_result_path, modify_encoding, modify_filter_words
|
|||
|
from scraper import read_comment
|
|||
|
|
|||
|
|
|||
|
def _parse_note_ids(links):
    """Return the note id (last non-empty URL path segment) of every non-blank line in *links*."""
    note_ids = []
    for line in links.splitlines():
        line = line.strip()
        if not line:
            continue
        # rstrip('/') so a link pasted with a trailing slash still yields its id
        # instead of an empty string.
        note_id = urlparse(line).path.rstrip('/').split('/')[-1]
        if note_id:
            note_ids.append(note_id)
    return note_ids


def run():
    """Handle a click on the start button: scrape comments for every entered note link.

    Reads the cookie, the two delay bounds, the output encoding, the filter
    words and the note links from the module-level widgets, pushes them into
    the shared configuration, recreates ``comments.csv`` with its header row
    and then calls ``read_comment`` once per note id.

    The outcome is shown in the ``result`` label.  On failure the exception is
    logged and re-raised after the label has been updated.  The button is
    disabled while the function runs and re-enabled in every case.

    Raises:
        Exception: whatever input parsing, file creation or ``read_comment``
            raised; it is re-raised unchanged.
    """
    # ``from config import RESULT_PATH`` copied the value once at import time.
    # A str is immutable, so a directory chosen later through
    # ``modify_result_path`` can only be visible on the ``config`` module
    # itself (presumably it rebinds ``config.RESULT_PATH`` -- verify against
    # config).  Reading the attribute at call time is never staler than the
    # imported copy.
    import config

    confirm_button.config(text='采集中')
    confirm_button['state'] = tk.DISABLED
    # This callback blocks the Tk event loop until scraping is finished, so
    # force a redraw now or the "busy" state would never be displayed.
    confirm_button.update_idletasks()
    try:
        HEAD['Cookie'] = cookie_text.get("1.0", tk.END).strip()

        # Parse the delay bounds before the output file is truncated, so a
        # typo in these fields cannot wipe an existing comments.csv.
        delay_min = int(sec1.get("1.0", tk.END))
        delay_max = int(sec2.get("1.0", tk.END))

        modify_encoding(encoding_text.get("1.0", tk.END).strip())

        # foc_in() removes the grey hint text (if it is showing) so the
        # placeholder is not mistaken for user input.
        keywords_text.foc_in()
        raw_words = keywords_text.get("1.0", tk.END).strip()
        # Drop blank entries: an empty field would otherwise produce [''].
        modify_filter_words([word for word in raw_words.split('\t') if word.strip()])

        note_link_text.foc_in()
        note_ids = _parse_note_ids(note_link_text.get("1.0", tk.END))

        header = ('nickname', 'content', 'ip_location', 'level', 'user_id',
                  'status', 'liked', 'create_time', 'note_id', 'id')
        csv_path = os.path.join(config.RESULT_PATH, 'comments.csv')
        # Write and close the header before scraping starts, so this handle's
        # buffered data cannot collide with anything read_comment writes.
        with open(csv_path, mode='w', encoding='utf8', errors='ignore') as f:
            f.write(','.join(header) + '\n')

        for note_id in note_ids:
            logger.info(f'scrape {note_id}')
            read_comment(note_id, s1=delay_min, s2=delay_max)
    except Exception as e:
        result.config(text=f'\U0001F605fail!!!\U0001F605\n{e}', fg='green')
        # Keep the traceback in the log; stderr may be invisible in a GUI launch.
        logger.exception('scrape failed')
        raise
    else:
        result.config(text='\U0001F618success!!!\U0001F618', fg='red')
        logger.info('over')
    finally:
        confirm_button.config(text='开始采集评论')
        confirm_button['state'] = tk.NORMAL
|
|||
|
|
|||
|
|
|||
|
def select_path():
    """Let the user pick the output directory and show the choice in the UI.

    Opens a directory chooser.  When a directory is returned it is passed to
    ``modify_result_path`` and the ``label_file`` text is updated to name it.
    A falsy result from the dialog leaves everything unchanged.
    """
    chosen = filedialog.askdirectory()
    if not chosen:
        return
    modify_result_path(chosen)
    notice = "评论输出到指定目录下的comments.csv, 会覆盖已存在文件\n已指定存储目录: " + chosen
    label_file.config(text=notice)
|
|||
|
|
|||
|
|
|||
|
class PlaceholderText(tk.Text):
    """A ``tk.Text`` that displays hint text in a distinct colour while it is empty.

    The hint is inserted on construction, removed when the widget gains focus
    and restored when the widget loses focus with nothing but whitespace in it.

    Args:
        master: parent widget, forwarded to ``tk.Text``.
        placeholder: hint text shown while the widget holds no user input.
        color: foreground colour used while the hint is displayed.
        **kwargs: any further ``tk.Text`` options.
    """

    def __init__(self, master=None, placeholder="请输入文本", color='grey', **kwargs):
        super().__init__(master, **kwargs)
        self.placeholder = placeholder
        self.placeholder_color = color
        # Remember the normal text colour so it can be restored in foc_in().
        self.default_fg_color = self['fg']

        self.insert("1.0", self.placeholder)
        self['fg'] = self.placeholder_color

        self.bind("<FocusIn>", self.foc_in)
        self.bind("<FocusOut>", self.foc_out)

    def foc_in(self, event=None):
        """Remove the hint text, if it is currently displayed, and restore the normal colour.

        May also be called directly (without an event) before reading the
        widget, so that the hint is not mistaken for user input.
        """
        # "end-1c" excludes the trailing newline the Text widget always keeps.
        shows_hint_text = self.get("1.0", "end-1c") == self.placeholder
        # Compare as str so the check does not depend on the object type
        # tkinter hands back for the option value.
        has_hint_color = str(self['fg']) == str(self.placeholder_color)
        if shows_hint_text and has_hint_color:
            self.delete("1.0", "end")
            self['fg'] = self.default_fg_color

    def foc_out(self, event=None):
        """Show the hint text again when the widget holds only whitespace."""
        if not self.get("1.0", "end-1c").strip():
            # Clear leftover whitespace first; otherwise the content becomes
            # "<placeholder><whitespace>", which foc_in() no longer
            # recognises as the hint and would leave in place as real text.
            self.delete("1.0", "end")
            self.insert("1.0", self.placeholder)
            self['fg'] = self.placeholder_color
|
|||
|
|
|||
|
|
|||
|
# ---------------------------------------------------------------------------
# Main window
# ---------------------------------------------------------------------------
root = tk.Tk()
root.title('小红书评论采集器')
root.minsize(850, 650)

# Font tuple shared by all the caption labels below.
_LABEL_FONT = ('微软雅黑', 14)

# Banner at the top of the window.
top_frame = tk.Frame(root)
top_frame.pack(padx=10, pady=10, fill=tk.X)
copyright = tk.Label(top_frame, text='老板好', font=('微软雅黑', 30), fg='red')
copyright.pack(side=tk.TOP)

# Font used by the editable text widgets.
entry_font = tkFont.Font(family='微软雅黑', size=14)

# ---------------------------------------------------------------------------
# Input area: cookie, note links, filter words
# ---------------------------------------------------------------------------
middle_frame = tk.Frame(root)
middle_frame.pack(padx=10, pady=5, fill=tk.X)

# Cookie caption and its single-line text box.
tk.Label(middle_frame, text='Cookie:', font=_LABEL_FONT).grid(row=0, column=0, sticky=tk.W)
cookie_text = tk.Text(middle_frame, width=72, height=1, font=entry_font)
# sticky east-west so the box stretches horizontally.
cookie_text.grid(row=0, column=1, columnspan=2, padx=30, pady=5, sticky=tk.EW)

# Note links: multi-line box with example placeholder text and a scrollbar.
tk.Label(
    middle_frame,
    text='笔记链接, 多个笔记链接请换行输入:',
    font=_LABEL_FONT,
).grid(row=1, column=0, sticky=tk.W, pady=5)
note_link_text = PlaceholderText(
    middle_frame,
    placeholder="例:\n"
                "https://www.xiaohongshu.com/explore/66174eea0000f00a1b00c6c6\n"
                "https://www.xiaohongshu.com/explore/4517423a0000f00a1b00c6c6",
    font=entry_font,
    width=110,
    height=7,
)
note_link_text.grid(row=2, column=0, columnspan=2, sticky=tk.EW)
scroll = tk.Scrollbar(middle_frame, command=note_link_text.yview)
scroll.grid(row=2, column=2, sticky=tk.NS)
note_link_text['yscrollcommand'] = scroll.set

# Filter (stop) words, separated by tab characters.
tk.Label(
    middle_frame,
    text='过滤停用词:',
    font=_LABEL_FONT,
).grid(row=3, column=0, sticky=tk.W, pady=5)
keywords_text = PlaceholderText(
    middle_frame,
    placeholder="过滤词之间用Tab键(制表键)分隔, 例: 老板\t送我\tStellar Blade豪华版\t爽一下",
    font=entry_font,
    width=3,
    height=1,
)
keywords_text.grid(row=3, column=1, columnspan=2, sticky=tk.EW)

# Give column 1 the spare horizontal space so the inputs expand with the window.
middle_frame.grid_columnconfigure(1, weight=1)

# ---------------------------------------------------------------------------
# Settings row: delay range between requests and output encoding
# ---------------------------------------------------------------------------
middle_frame1 = tk.Frame(root)
middle_frame1.pack(pady=5, fill=tk.X)

tk.Label(
    middle_frame1,
    text='每次爬取间隔时间范围, 单位: 秒 (在区间内随机): ',
    font=_LABEL_FONT,
).grid(row=3, column=0, sticky=tk.W, pady=5)

# First delay bound, pre-filled with 1.
sec1 = tk.Text(middle_frame1, width=3, height=1, font=entry_font)
sec1.grid(row=3, column=1, sticky=tk.W)
sec1.insert('1.0', '1')
tk.Label(middle_frame1, text='s', font=_LABEL_FONT).grid(row=3, column=2, sticky=tk.W)
tk.Label(middle_frame1, text='———', font=_LABEL_FONT).grid(row=3, column=3, columnspan=2, sticky=tk.EW)

# Second delay bound, pre-filled with 3.
sec2 = tk.Text(middle_frame1, width=3, height=1, font=entry_font)
sec2.grid(row=3, column=5, sticky=tk.W)
sec2.insert('1.0', '3')
tk.Label(middle_frame1, text='s', font=_LABEL_FONT).grid(row=3, column=6, sticky=tk.W)

# Encoding used for the stored result, pre-filled with GBK.
tk.Label(
    middle_frame1,
    text='存储结果编码: ',
    font=_LABEL_FONT,
).grid(row=4, column=0, sticky=tk.E, pady=5)
encoding_text = tk.Text(middle_frame1, width=8, height=1, font=entry_font)
encoding_text.grid(row=4, column=1, columnspan=3, sticky=tk.W)
encoding_text.insert('1.0', 'GBK')

# ---------------------------------------------------------------------------
# Output directory chooser and its explanatory label
# ---------------------------------------------------------------------------
middle_frame2 = tk.Frame(root)
middle_frame2.pack(padx=10, pady=5, fill=tk.X)

upload_button = tk.Button(middle_frame2, text="选择存储目录", command=select_path, padx=30, pady=5)
upload_button.grid(row=1, column=0, padx=(100, 20), pady=5)
label_file = tk.Label(
    middle_frame2,
    text='评论输出到指定目录下的comments.csv, 会覆盖已存在文件\n'
         f'默认输出目录: {RESULT_PATH}',
    font=('微软雅黑', 10),
    fg='blue',
)
label_file.grid(row=1, column=1, padx=(5, 10), pady=5, sticky=tk.W)

# ---------------------------------------------------------------------------
# Start button and result message
# ---------------------------------------------------------------------------
bottom_frame = tk.Frame(root)
bottom_frame.pack(padx=10, pady=5, fill=tk.X)

confirm_button = tk.Button(bottom_frame, text='开始采集评论', command=run, padx=100, pady=10)
confirm_button.pack(side=tk.TOP)
result = tk.Label(bottom_frame, font=('微软雅黑', 10))
result.pack(side=tk.TOP)
|
|||
|
|
|||
|
|
|||
|
def main():
    """Register the log file sink under ``RESULT_PATH/logs`` and enter the Tk main loop."""
    logger.add(
        os.path.join(RESULT_PATH, 'logs', '{time:YYYY-MM-DD HH}.log'),
        rotation="1 hour",
        retention=12,
        compression="zip",
    )
    root.mainloop()
|
|||
|
|
|||
|
|
|||
|
# Call main() only when this file is executed directly.
if __name__ == "__main__":
    main()
|