init
This commit is contained in:
commit
5aa0ba84c1
|
@ -0,0 +1,8 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
|
@ -0,0 +1,60 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredPackages">
|
||||
<value>
|
||||
<list size="40">
|
||||
<item index="0" class="java.lang.String" itemvalue="cv2" />
|
||||
<item index="1" class="java.lang.String" itemvalue="greenlet" />
|
||||
<item index="2" class="java.lang.String" itemvalue="networkx" />
|
||||
<item index="3" class="java.lang.String" itemvalue="threadpoolctl" />
|
||||
<item index="4" class="java.lang.String" itemvalue="huggingface-hub" />
|
||||
<item index="5" class="java.lang.String" itemvalue="scikit-learn" />
|
||||
<item index="6" class="java.lang.String" itemvalue="MarkupSafe" />
|
||||
<item index="7" class="java.lang.String" itemvalue="numpy" />
|
||||
<item index="8" class="java.lang.String" itemvalue="torchvision" />
|
||||
<item index="9" class="java.lang.String" itemvalue="redis" />
|
||||
<item index="10" class="java.lang.String" itemvalue="fsspec" />
|
||||
<item index="11" class="java.lang.String" itemvalue="filelock" />
|
||||
<item index="12" class="java.lang.String" itemvalue="lit" />
|
||||
<item index="13" class="java.lang.String" itemvalue="safetensors" />
|
||||
<item index="14" class="java.lang.String" itemvalue="certifi" />
|
||||
<item index="15" class="java.lang.String" itemvalue="gevent" />
|
||||
<item index="16" class="java.lang.String" itemvalue="urllib3" />
|
||||
<item index="17" class="java.lang.String" itemvalue="itsdangerous" />
|
||||
<item index="18" class="java.lang.String" itemvalue="zope.event" />
|
||||
<item index="19" class="java.lang.String" itemvalue="sympy" />
|
||||
<item index="20" class="java.lang.String" itemvalue="Flask" />
|
||||
<item index="21" class="java.lang.String" itemvalue="tokenizers" />
|
||||
<item index="22" class="java.lang.String" itemvalue="scipy" />
|
||||
<item index="23" class="java.lang.String" itemvalue="transformers" />
|
||||
<item index="24" class="java.lang.String" itemvalue="triton" />
|
||||
<item index="25" class="java.lang.String" itemvalue="Werkzeug" />
|
||||
<item index="26" class="java.lang.String" itemvalue="tzdata" />
|
||||
<item index="27" class="java.lang.String" itemvalue="zope.interface" />
|
||||
<item index="28" class="java.lang.String" itemvalue="torch" />
|
||||
<item index="29" class="java.lang.String" itemvalue="click" />
|
||||
<item index="30" class="java.lang.String" itemvalue="pandas" />
|
||||
<item index="31" class="java.lang.String" itemvalue="tqdm" />
|
||||
<item index="32" class="java.lang.String" itemvalue="regex" />
|
||||
<item index="33" class="java.lang.String" itemvalue="mpmath" />
|
||||
<item index="34" class="java.lang.String" itemvalue="cmake" />
|
||||
<item index="35" class="java.lang.String" itemvalue="typing_extensions" />
|
||||
<item index="36" class="java.lang.String" itemvalue="charset-normalizer" />
|
||||
<item index="37" class="java.lang.String" itemvalue="redis-py-cluster" />
|
||||
<item index="38" class="java.lang.String" itemvalue="pytz" />
|
||||
<item index="39" class="java.lang.String" itemvalue="Pillow" />
|
||||
</list>
|
||||
</value>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
||||
<option name="ignoredErrors">
|
||||
<list>
|
||||
<option value="N802" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
|
@ -0,0 +1,6 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/douyin.iml" filepath="$PROJECT_DIR$/.idea/douyin.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,53 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :redbook
|
||||
@File :config.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2024/4/17 16:42
|
||||
"""
|
||||
import os
|
||||
|
||||
# Base HTTP headers sent with every comment request. Callers add a 'Cookie'
# entry to this dict at runtime before scraping.
HEAD = {
    'authority': 'www.douyin.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'cache-control': 'no-cache',
    'pragma': 'no-cache',
    'referer': 'https://www.douyin.com/',
    'sec-ch-ua': '"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}

# Keys copied out of each (flattened) raw comment by the scraper.
INFO_COLUMNS = ['nickname', 'text', 'cid', 'signature', 'uid', 'short_id', 'status', 'digg_count', 'create_time',
                'level', 'aweme_id', 'reply_id']
# Column order of every data row written to comments.csv.
RESULT_COLUMNS = ['nickname', 'text', 'ip_label', 'level', 'signature', 'uid', 'short_id', 'status', 'digg_count',
                  'create_time', 'cid', 'aweme_id', 'reply_id']
# Chinese header row for comments.csv; positions line up with RESULT_COLUMNS.
RESULT_COLUMNS_CH = ['昵称', '评论内容', 'ip归属地', '级别', '用户签名', '用户id', '用户短id', '评论状态', '红心数',
                     '评论时间', '评论id', '视频id', '回复评论id']
# Keyword filter: when non-empty, only comments whose text contains at least
# one of these keywords are kept. Filled through modify_filter_words().
KEYWORDS = []
# Output directory for comments.csv and logs; defaults to the working
# directory at import time.
RESULT_PATH = os.path.abspath('./')
# Text encoding used when appending rows to comments.csv.
ENCODING = 'gbk'
|
||||
|
||||
|
||||
def modify_result_path(v):
    """Point the module-level ``RESULT_PATH`` setting at *v*.

    Only this module's global is rebound. A module that copied the value with
    ``from config import RESULT_PATH`` before this call keeps its old copy.

    :param v: new output directory.
    """
    globals()['RESULT_PATH'] = v
|
||||
|
||||
|
||||
def modify_filter_words(v):
    """Replace the active keyword filter with the non-empty entries of *v*.

    The previous implementation only appended, so keywords from earlier runs
    stayed active forever. The list is now replaced *in place*: other modules
    hold a reference to the same list object (``from config import
    KEYWORDS``), so rebinding the name would leave them with stale contents.

    :param v: iterable of keyword strings; falsy entries (e.g. ``''``) are
        dropped.
    """
    KEYWORDS[:] = [word for word in v if word]
|
||||
|
||||
|
||||
def modify_encoding(v):
    """Set the module-level ``ENCODING`` setting to *v*.

    Only this module's global is rebound. A module that copied the value with
    ``from config import ENCODING`` before this call keeps its old copy.

    :param v: codec name to use for the result file.
    """
    globals().update(ENCODING=v)
|
Binary file not shown.
|
@ -0,0 +1,4 @@
|
|||
# Entry script: imports the GUI module's main() and runs it.
# NOTE(review): the GUI module's own header names the file "visualize.py";
# confirm that a module called "visualization" actually exists.
from visualization import main

# Start the application only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
|
@ -0,0 +1,103 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :redbook
|
||||
@File :scraper.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2024/4/17 11:53
|
||||
"""
|
||||
import datetime
|
||||
import random
|
||||
import time
|
||||
|
||||
from loguru import logger
|
||||
import requests
|
||||
|
||||
from config import HEAD, INFO_COLUMNS, RESULT_COLUMNS, RESULT_PATH, ENCODING, KEYWORDS
|
||||
|
||||
|
||||
def request_comment(aweme_id, cursor=0, count=20, comment_id=None, s1=1, s2=3, timeout=30):
    """Fetch one page of comments (or of replies to a comment) as parsed JSON.

    :param aweme_id: id of the video whose comments are requested.
    :param cursor: paging offset sent to the endpoint.
    :param count: page size sent to the endpoint.
    :param comment_id: when truthy, the replies of this comment are requested
        from the reply endpoint instead of the video's top-level comments.
    :param s1: lower bound, in seconds, of the pause after a successful call.
    :param s2: upper bound, in seconds, of that pause. ``s2 > s1`` sleeps a
        random time in ``[s1, s2]``, ``s1 == s2`` sleeps exactly ``s1``, and
        ``s2 < s1`` does not sleep.
    :param timeout: seconds to wait for the server before ``requests`` gives
        up; previously the call could block forever.
    :return: the decoded response dict (its ``status_code`` is 0).
    :raises ConnectionError: if the body is not JSON or ``status_code`` is
        missing or non-zero.
    """
    params = {'count': count, 'aid': 6383}
    if comment_id:
        url = 'https://www.douyin.com/aweme/v1/web/comment/list/reply/'
        params['item_id'] = aweme_id
        params['comment_id'] = comment_id
    else:
        url = 'https://www.douyin.com/aweme/v1/web/comment/list/'
        params['aweme_id'] = aweme_id
    params['cursor'] = cursor

    r = requests.get(url, headers=HEAD, params=params, timeout=timeout)
    try:
        respond = r.json()
    except ValueError as exc:
        # An empty or non-JSON body used to surface as a bare decode error;
        # report it the same way as any other failed request.
        logger.error(f'fail to request {url}')
        logger.error(r.text)
        raise ConnectionError(f'fail to request {url}, respond: {r.text!r}') from exc

    if isinstance(respond, dict) and respond.get('status_code') == 0:
        # Pause only after a successful call, to throttle the crawl.
        if s2 > s1:
            gap = random.uniform(s1, s2)
        elif s1 == s2:
            gap = s1
        else:
            gap = 0
        time.sleep(gap)
        return respond

    logger.error(f'fail to request {url}')
    logger.error(respond)
    raise ConnectionError(f'fail to request {url}, respond: {respond}')
|
||||
|
||||
|
||||
def transform_comment(comment):
    """Flatten a raw comment dict into the row dict used for output.

    The caller's dict is left untouched. The previous version overwrote
    ``comment['create_time']`` with the formatted string and added user
    fields to the input, so calling it twice on the same dict raised
    ``ValueError``.

    :param comment: raw comment; must contain ``create_time`` (epoch
        seconds), a ``user`` dict with ``uid``, ``short_id``, ``nickname``
        and ``signature``, and every remaining key of ``INFO_COLUMNS``.
    :return: dict with the ``INFO_COLUMNS`` keys plus ``ip_label`` (``''``
        when the raw comment has none). ``create_time`` is formatted as
        ``YYYY-MM-DD HH:MM:SS`` in local time.
    :raises KeyError: if a required key is missing.
    """
    user = comment['user']
    created = datetime.datetime.fromtimestamp(int(comment['create_time']))

    # Work on a shallow copy so the derived fields never leak into the input.
    flat = dict(comment)
    flat.update(
        create_time=created.strftime('%Y-%m-%d %H:%M:%S'),
        uid=user['uid'],
        short_id=user['short_id'],
        nickname=user['nickname'],
        signature=user['signature'],
    )

    row = {key: flat[key] for key in INFO_COLUMNS}
    row['ip_label'] = comment.get('ip_label', '')
    return row
|
||||
|
||||
|
||||
def parse_comment(comments, s1=1, s2=3):
    """Filter one page of comments, append the kept ones to comments.csv and
    crawl the replies of every comment on the page that has any.

    Rows are written with the ``csv`` module so commas, quotes or line breaks
    inside a comment no longer corrupt the columns. Plain fields are written
    exactly as before.

    NOTE(review): ``RESULT_PATH`` and ``ENCODING`` are the values this module
    imported from ``config`` at import time; later calls to
    ``config.modify_result_path`` / ``config.modify_encoding`` are not seen
    here.

    :param comments: iterable of raw comment dicts; ``None`` is treated as an
        empty page.
    :param s1: lower bound of the per-request pause, forwarded to the reply
        crawl (previously the reply crawl always used the defaults).
    :param s2: upper bound of the per-request pause, forwarded likewise.
    :return: list of transformed comments from this page that passed the
        keyword filter.
    """
    import csv  # local import so this block does not depend on the file header

    kept = []
    parents = []  # (aweme_id, cid) of comments whose replies must be fetched
    for comment in comments or []:
        logger.debug(f'parse comment:\n{comment}')
        # Reply comments carry no 'reply_comment_total', hence the default.
        if comment.get('reply_comment_total', 0):
            parents.append((comment['aweme_id'], comment['cid']))
        # With no keywords configured every comment is kept; otherwise the
        # text must contain at least one keyword. Replies are queued above
        # even when the parent comment itself is filtered out.
        if KEYWORDS and not any(keyword in comment['text'] for keyword in KEYWORDS):
            continue
        row = transform_comment(comment)
        logger.debug(row)
        kept.append(row)

    # newline='' is what the csv module requires of files it writes to.
    with open(f'{RESULT_PATH}/comments.csv', mode='a', encoding=ENCODING,
              errors='ignore', newline='') as f:
        csv.writer(f).writerows([str(row[k]) for k in RESULT_COLUMNS] for row in kept)

    if parents:
        logger.debug('load sub comment')
    for note_id, comment_id in parents:
        read_comment(note_id, comment_id=comment_id, cursor=0, s1=s1, s2=s2)
    return kept


def read_comment(aweme_id, comment_id=None, cursor=0, s1=1, s2=3):
    """Crawl every page of a video's comments, or of one comment's replies.

    Each page is handed to ``parse_comment``, which writes it out and
    recurses into replies.

    :param aweme_id: id of the video.
    :param comment_id: when given, crawl the replies of this comment instead
        of the video's top-level comments.
    :param cursor: paging offset of the first page to request.
    :param s1: lower bound, in seconds, of the pause after each request.
    :param s2: upper bound, in seconds, of the pause after each request.
    """
    while True:
        data = request_comment(aweme_id, cursor=cursor, comment_id=comment_id, s1=s1, s2=s2)
        parse_comment(data.get('comments'), s1=s1, s2=s2)
        if not data.get('has_more'):
            break
        logger.debug('load next page')
        cursor = data['cursor']
|
|
@ -0,0 +1,15 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :redbook
|
||||
@File :test.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2024/4/18 16:29
|
||||
"""
|
||||
from config import HEAD
|
||||
from scraper import read_comment
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: crawl the comments of one fixed video.
    # The session cookie is a credential and must not live in the repository,
    # so it is read from the DOUYIN_COOKIE environment variable instead.
    import os
    import sys

    cookie = os.environ.get('DOUYIN_COOKIE', '').strip()
    if not cookie:
        sys.exit('Set the DOUYIN_COOKIE environment variable to a logged-in douyin.com cookie string.')
    HEAD['Cookie'] = cookie
    read_comment('7357654383712128283')
|
|
@ -0,0 +1,199 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: UTF-8 -*-
|
||||
"""
|
||||
@Project :redbook
|
||||
@File :visualize.py
|
||||
@IDE :PyCharm
|
||||
@Author :rengengchen
|
||||
@Time :2024/4/17 14:44
|
||||
"""
|
||||
import os
|
||||
import tkinter as tk
|
||||
import tkinter.font as tkFont
|
||||
from tkinter import filedialog
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from config import HEAD, RESULT_PATH, modify_result_path, modify_encoding, modify_filter_words, RESULT_COLUMNS_CH
|
||||
from scraper import read_comment
|
||||
|
||||
|
||||
# Function to handle button click
|
||||
def run():
    """Button callback: read the form, reset comments.csv and crawl each link.

    Fixes over the previous version:

    * ``Text.get("1.0", tk.END)`` always ends with a newline, so the
      ``isnumeric()`` checks never passed and the delay range was always 0.
      The values are now stripped before they are validated.
    * The encoding handed to ``open()`` is the stripped value, the same one
      passed to ``modify_encoding``.
    * A link with a trailing slash no longer yields an empty video id.
    * The button is redrawn before the blocking crawl starts, so the
      "in progress" caption is actually shown.

    NOTE(review): ``RESULT_PATH`` is the value imported from ``config`` when
    this module was loaded, so a directory chosen through ``select_path`` is
    not reflected in the path used below.

    Any exception is shown in the result label and then re-raised.
    """

    def _read_seconds(widget):
        """Return the widget's text as an int, or None if it is not a plain number."""
        raw = widget.get("1.0", tk.END).strip()
        return int(raw) if raw.isdecimal() else None

    confirm_button.config(text='采集中')
    confirm_button['state'] = tk.DISABLED
    # The crawl blocks the Tk event loop; force the redraw now.
    confirm_button.update_idletasks()
    try:
        HEAD['Cookie'] = cookie_text.get("1.0", tk.END).strip()

        # Delay range: an invalid lower bound means 0; an invalid or smaller
        # upper bound collapses onto the lower bound.
        s1 = _read_seconds(sec1) or 0
        s2 = _read_seconds(sec2)
        s2 = s1 if s2 is None else max(s1, s2)

        encoding = encoding_text.get("1.0", tk.END).strip()
        modify_encoding(encoding)

        # foc_in() clears the placeholder so it is not read as user input.
        keywords_text.foc_in()
        filter_words = keywords_text.get("1.0", tk.END)
        modify_filter_words(filter_words.strip().split('\t'))

        note_link_text.foc_in()
        links = note_link_text.get("1.0", tk.END)
        aweme_ids = []
        for link in links.split('\n'):
            link = link.strip()
            if not link:
                continue
            # The video id is the last path segment of the link.
            aweme_id = urlparse(link).path.rstrip('/').split('/')[-1]
            if aweme_id:
                aweme_ids.append(aweme_id)

        # Start a fresh result file containing only the header row.
        with open(f'{RESULT_PATH}/comments.csv', mode='w', encoding=encoding, errors='ignore') as f:
            f.write(','.join(RESULT_COLUMNS_CH))
            f.write('\n')

        for aweme_id in aweme_ids:
            logger.info(f'scrape {aweme_id}')
            read_comment(aweme_id, s1=s1, s2=s2)
    except Exception as e:
        result.config(text=f'\U0001F605fail!!!\U0001F605\n{e}', fg='green')
        raise
    else:
        result.config(text='\U0001F618success!!!\U0001F618', fg='red')
        logger.info(f'over')
    finally:
        confirm_button.config(text='开始采集评论')
        confirm_button['state'] = tk.NORMAL
|
||||
|
||||
|
||||
def select_path():
    """Button callback: ask for an output directory and apply the choice.

    Nothing changes when the dialog returns an empty value. Otherwise the
    directory is passed to ``modify_result_path`` and shown in the label.
    """
    chosen = filedialog.askdirectory()
    if not chosen:
        return
    modify_result_path(chosen)
    message = "评论输出到指定目录下的comments.csv, 会覆盖已存在文件\n已指定存储目录: " + chosen
    label_file.config(text=message)
|
||||
|
||||
|
||||
class PlaceholderText(tk.Text):
    """A ``tk.Text`` that shows hint text in a placeholder color.

    The hint is removed when the widget gains focus and put back when it
    loses focus while holding nothing but whitespace.
    """

    def __init__(self, master=None, placeholder="请输入文本", color='grey', **kwargs):
        """Create the widget and display the placeholder immediately.

        :param master: parent widget.
        :param placeholder: hint text shown while the widget is empty.
        :param color: foreground color used for the hint text.
        :param kwargs: forwarded to ``tk.Text``.
        """
        super().__init__(master, **kwargs)
        self.placeholder = placeholder
        self.placeholder_color = color
        # Remember the regular text color so it can be restored on focus.
        self.default_fg_color = self['fg']
        self._show_placeholder()
        for sequence, handler in (("<FocusIn>", self.foc_in), ("<FocusOut>", self.foc_out)):
            self.bind(sequence, handler)

    def _show_placeholder(self):
        """Insert the hint text at the start and switch to the hint color."""
        self.insert("1.0", self.placeholder)
        self['fg'] = self.placeholder_color

    def foc_in(self, event=None):
        """Clear the hint, if it is what is currently displayed."""
        content = self.get("1.0", "end-1c")
        if content != self.placeholder or self['fg'] != self.placeholder_color:
            return
        self.delete("1.0", "end")
        self['fg'] = self.default_fg_color

    def foc_out(self, event=None):
        """Show the hint again when the widget holds only whitespace."""
        if self.get("1.0", "end-1c").strip():
            return
        self._show_placeholder()
|
||||
|
||||
|
||||
# Create the main window.
root = tk.Tk()
root.title('抖音评论采集器')
root.minsize(width=850, height=650)

# Top frame holding the banner label.
top_frame = tk.Frame(root)
top_frame.pack(fill=tk.X, padx=10, pady=10)
# NOTE(review): this name shadows the built-in `copyright`.
copyright = tk.Label(top_frame, text='老板好', font=('微软雅黑', 30), fg='red')
copyright.pack(side=tk.TOP)

# Font shared by the text-entry widgets.
entry_font = tkFont.Font(family='微软雅黑', size=14)

# Middle frame for the cookie, link and keyword inputs.
middle_frame = tk.Frame(root)
middle_frame.pack(fill=tk.X, padx=10, pady=5)

# Cookie label and text box.
tk.Label(middle_frame, text='Cookie:', font=('微软雅黑', 14)).grid(row=0, column=0, sticky='w')
cookie_text = tk.Text(middle_frame, width=72, height=1, font=entry_font)
cookie_text.grid(row=0, column=1, padx=30, pady=5, columnspan=2, sticky='ew')  # make sure it fills horizontally

# Link input (one link per line) with placeholder and scrollbar.
tk.Label(middle_frame, text='笔记链接, 多个笔记链接请换行输入:',
         font=('微软雅黑', 14)).grid(row=1, column=0, sticky='w', pady=5)
note_link_text = PlaceholderText(middle_frame,
                                 placeholder="例:\n"
                                             "https://www.douyin.com/video/7357654383712128283\n"
                                             "https://www.douyin.com/video/7357654383712128283",
                                 font=entry_font, width=110, height=7)
note_link_text.grid(row=2, column=0, sticky='ew', columnspan=2)
scroll = tk.Scrollbar(middle_frame, command=note_link_text.yview)
scroll.grid(row=2, column=2, sticky='ns')
note_link_text.config(yscrollcommand=scroll.set)

# Filter keywords (Tab-separated).
tk.Label(middle_frame, text='过滤停用词:',
         font=('微软雅黑', 14)).grid(row=3, column=0, sticky='w', pady=5)
keywords_text = PlaceholderText(middle_frame,
                                placeholder="过滤词之间用Tab键(制表键)分隔, 例: 老板\t送我\tStellar Blade豪华版\t爽一下",
                                font=entry_font, width=3, height=1)
keywords_text.grid(row=3, column=1, sticky='ew', columnspan=2)

# Give column 1 the extra horizontal space so the inputs stretch.
middle_frame.grid_columnconfigure(1, weight=1)

# Second middle frame: delay range and output encoding.
middle_frame1 = tk.Frame(root)
middle_frame1.pack(fill=tk.X, pady=5)
# Delay range between requests, in seconds (defaults 1 and 3).
tk.Label(middle_frame1, text='每次爬取间隔时间范围, 单位: 秒 (在区间内随机): ',
         font=('微软雅黑', 14)).grid(row=3, column=0, sticky='w', pady=5)
sec1 = tk.Text(middle_frame1, width=3, height=1, font=entry_font)
sec1.grid(row=3, column=1, sticky='w')
sec1.insert('1.0', '1')
tk.Label(middle_frame1, text='s', font=('微软雅黑', 14)).grid(row=3, column=2, sticky='w')
tk.Label(middle_frame1, text='———', font=('微软雅黑', 14)).grid(row=3, column=3, sticky='ew', columnspan=2)
sec2 = tk.Text(middle_frame1, width=3, height=1, font=entry_font)
sec2.grid(row=3, column=5, sticky='w')
sec2.insert('1.0', '3')
tk.Label(middle_frame1, text='s', font=('微软雅黑', 14)).grid(row=3, column=6, sticky='w')
# Encoding of the result file (default 'GBK').
tk.Label(middle_frame1, text='存储结果编码: ',
         font=('微软雅黑', 14)).grid(row=4, column=0, sticky='e', pady=5)
encoding_text = tk.Text(middle_frame1, width=8, height=1, font=entry_font)
encoding_text.grid(row=4, column=1, sticky='w', columnspan=3)
encoding_text.insert('1.0', 'GBK')

# Frame for the output-directory chooser.
middle_frame2 = tk.Frame(root)
middle_frame2.pack(fill=tk.X, padx=10, pady=5)
# Directory button and the label describing where results go.
upload_button = tk.Button(middle_frame2, text="选择存储目录", command=select_path, padx=30, pady=5)
upload_button.grid(row=1, column=0, padx=(100, 20), pady=5)
label_file = tk.Label(middle_frame2, text='评论输出到指定目录下的comments.csv, 会覆盖已存在文件\n'
                                          f'默认输出目录: {RESULT_PATH}',
                      font=('微软雅黑', 10), fg='blue')
label_file.grid(row=1, column=1, padx=(5, 10), pady=5, sticky='w')

# Bottom frame: start button and result message.
bottom_frame = tk.Frame(root)
bottom_frame.pack(fill=tk.X, padx=10, pady=5)
confirm_button = tk.Button(bottom_frame, text='开始采集评论', command=run, padx=100, pady=10)
confirm_button.pack(side=tk.TOP)
result = tk.Label(bottom_frame, font=('微软雅黑', 10))
result.pack(side=tk.TOP)
|
||||
|
||||
|
||||
def main():
    """Attach a log file sink under ``RESULT_PATH/logs`` and run the Tk loop.

    The log file name is a template containing a ``{time:...}`` placeholder;
    the sink is added with hourly rotation, a retention of 12 and zip
    compression.
    """
    logger.add(
        os.path.join(RESULT_PATH, 'logs', '{time:YYYY-MM-DD HH}.log'),
        rotation="1 hour",
        retention=12,
        compression="zip",
    )
    root.mainloop()


# Run the GUI only when this module is executed directly.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue