104 lines
3.5 KiB
Python
104 lines
3.5 KiB
Python
|
#!/usr/bin/env python
|
|||
|
# -*- coding: UTF-8 -*-
|
|||
|
"""
|
|||
|
@Project :redbook
|
|||
|
@File :scraper.py
|
|||
|
@IDE :PyCharm
|
|||
|
@Author :rengengchen
|
|||
|
@Time :2024/4/17 11:53
|
|||
|
"""
|
|||
|
import datetime
|
|||
|
import random
|
|||
|
import time
|
|||
|
|
|||
|
from loguru import logger
|
|||
|
import requests
|
|||
|
|
|||
|
from config import HEAD, INFO_COLUMNS, RESULT_COLUMNS, RESULT_PATH, ENCODING, KEYWORDS
|
|||
|
|
|||
|
|
|||
|
def request_comment(aweme_id, cursor=0, count=20, comment_id=None, s1=1, s2=3, timeout=10):
    """Request one page of comments, or of replies to a comment, from Douyin.

    When ``comment_id`` is truthy the reply endpoint is queried for that
    comment; otherwise the top-level comment list endpoint is used. After a
    successful request the function sleeps before returning, to space out
    consecutive calls.

    Args:
        aweme_id: Item identifier, sent as ``aweme_id`` (comment list) or
            ``item_id`` (reply list).
        cursor: Paging position sent as the ``cursor`` query parameter.
        count: Page size sent as the ``count`` query parameter.
        comment_id: Parent comment id; selects the reply endpoint when truthy.
        s1: Lower bound, in seconds, of the pause after a successful request.
        s2: Upper bound, in seconds, of the pause. If ``s2 > s1`` the pause is
            drawn uniformly from ``[s1, s2]``; if they are equal the pause is
            exactly ``s1``; if ``s2 < s1`` there is no pause.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection fails instead of blocking the crawl indefinitely.

    Returns:
        The decoded JSON response, whose ``status_code`` is 0.

    Raises:
        ConnectionError: If the response's ``status_code`` is missing or not 0.
        ValueError: If the response body is not valid JSON (re-raised after
            logging).
    """
    params = {'count': count, 'aid': 6383}
    if comment_id:
        url = 'https://www.douyin.com/aweme/v1/web/comment/list/reply/'
        params['item_id'] = aweme_id
        params['comment_id'] = comment_id
    else:
        url = 'https://www.douyin.com/aweme/v1/web/comment/list/'
        params['aweme_id'] = aweme_id
    params['cursor'] = cursor

    r = requests.get(url, headers=HEAD, params=params, timeout=timeout)
    try:
        respond = r.json()
    except ValueError:
        # Keep the original exception type, but record which request failed.
        logger.error(f'fail to decode response of {url}, http status: {r.status_code}')
        raise

    # .get() routes a payload without status_code to the ConnectionError below
    # instead of surfacing as an unrelated KeyError.
    if respond.get('status_code') != 0:
        logger.error(f'fail to request {url}')
        logger.error(respond)
        raise ConnectionError(f'fail to request {url}, respond: {respond}')

    if s2 > s1:
        gap = random.uniform(s1, s2)
    elif s1 == s2:
        gap = s1
    else:
        gap = 0
    time.sleep(gap)
    return respond
|
|||
|
|
|||
|
|
|||
|
def transform_comment(comment):
    """Build a flat record from a raw comment dict without modifying it.

    The ``create_time`` timestamp is rendered as a ``'%Y-%m-%d %H:%M:%S'``
    string (via ``datetime.fromtimestamp``, i.e. in local time), and the
    ``uid``, ``short_id``, ``nickname`` and ``signature`` fields are lifted
    out of the nested ``user`` dict. The result holds the keys listed in
    ``INFO_COLUMNS`` plus ``ip_label``.

    Args:
        comment: Raw comment dict; must contain ``create_time``, ``user`` and
            every other key named in ``INFO_COLUMNS``.

    Returns:
        A new dict with the ``INFO_COLUMNS`` keys and ``ip_label`` (empty
        string when the comment has none).

    Raises:
        KeyError: If a required key is missing from ``comment`` or its
            ``user`` dict.
    """
    # Work on a shallow copy: writing the formatted time back into the
    # caller's dict would make a second call fail on int('YYYY-mm-dd ...').
    flat = dict(comment)

    created = datetime.datetime.fromtimestamp(int(comment['create_time']))
    flat['create_time'] = created.strftime('%Y-%m-%d %H:%M:%S')

    user = comment['user']
    for field in ('uid', 'short_id', 'nickname', 'signature'):
        flat[field] = user[field]

    record = {column: flat[column] for column in INFO_COLUMNS}
    record['ip_label'] = comment.get('ip_label', '')
    return record
|
|||
|
|
|||
|
|
|||
|
def parse_comment(comments):
    """Filter one page of comments, append the matches to the CSV, crawl replies.

    A comment is kept when ``KEYWORDS`` is empty or when at least one keyword
    occurs in its ``text``. Kept comments are passed through
    ``transform_comment`` and appended to ``{RESULT_PATH}/comments.csv`` with
    the columns listed in ``RESULT_COLUMNS``. Every comment with a non-zero
    ``reply_comment_total`` (whether or not it matched a keyword) has its
    replies loaded through ``read_comment``, which calls back into this
    function for each reply page.

    Args:
        comments: Iterable of raw comment dicts from one response page.

    Returns:
        The list of transformed comments kept from this page only; replies
        are written to the CSV by the nested calls but are not included here.
    """
    # Imported locally so this function brings its own dependency with it.
    import csv

    kept = []
    reply_targets = []
    for comment in comments:
        logger.debug(f'parse comment:\n{comment}')
        # Sub-comments (replies) do not carry this attribute, hence the default.
        if comment.get('reply_comment_total', 0):
            reply_targets.append((comment['aweme_id'], comment['cid']))

        # With no keywords configured every comment is accepted.
        if KEYWORDS and not any(keyword in comment['text'] for keyword in KEYWORDS):
            continue

        row = transform_comment(comment)
        logger.debug(row)
        kept.append(row)

    # csv.writer quotes fields containing commas, quotes or line breaks, which
    # a plain ','.join would write verbatim and thereby corrupt the row.
    # newline='' is what the csv module expects; '\n' keeps the original row
    # terminator.
    with open(f'{RESULT_PATH}/comments.csv', mode='a', encoding=ENCODING,
              errors='ignore', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(
            [str(row[column]) for column in RESULT_COLUMNS] for row in kept
        )

    if reply_targets:
        logger.debug('load sub comment')
        for note_id, comment_id in reply_targets:
            read_comment(note_id, comment_id=comment_id, cursor=0)
    return kept
|
|||
|
|
|||
|
|
|||
|
def read_comment(aweme_id, comment_id=None, cursor=0, s1=1, s2=3):
    """Page through a comment list (or a reply list) and parse every page.

    Starting at ``cursor``, each page is fetched with ``request_comment`` and
    its ``comments`` entry handed to ``parse_comment``. Paging continues from
    the ``cursor`` value of the latest response for as long as that response
    reports a truthy ``has_more``.

    Args:
        aweme_id: Item identifier forwarded to ``request_comment``.
        comment_id: Parent comment id; when given, replies are read instead
            of top-level comments.
        cursor: Paging position of the first request.
        s1: Lower bound of the pause between requests, forwarded unchanged.
        s2: Upper bound of the pause between requests, forwarded unchanged.

    Returns:
        None.
    """
    position = cursor
    while True:
        page = request_comment(aweme_id, cursor=position, comment_id=comment_id, s1=s1, s2=s2)
        parse_comment(page['comments'])
        if not page['has_more']:
            break
        logger.debug('load next page')
        position = page['cursor']
|