#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project :redbook
@File    :scraper.py
@IDE     :PyCharm
@Author  :rengengchen
@Time    :2024/4/17 11:53
"""
import csv
import datetime
import random
import time

from loguru import logger
import requests

from config import (
    HEAD,
    INFO_COLUMNS,
    RESULT_COLUMNS,
    RESULT_PATH,
    ENCODING,
    KEYWORDS,
)


def _pause(s1, s2):
    """Sleep between requests: random in [s1, s2], exactly s1, or not at all."""
    if s2 > s1:
        gap = random.uniform(s1, s2)
    elif s1 == s2:
        gap = s1
    else:
        # An inverted range (s1 > s2) disables the delay.
        gap = 0
    time.sleep(gap)


def request_comment(aweme_id, cursor=0, count=20, comment_id=None, s1=1, s2=3,
                    timeout=10):
    """Fetch one page of comments (or of replies to a comment) as parsed JSON.

    When ``comment_id`` is given, the reply-list endpoint is queried for that
    comment; otherwise the top-level comment list of ``aweme_id`` is queried.
    After a successful request the function sleeps (see ``s1``/``s2``) before
    returning.

    Args:
        aweme_id: Identifier of the item whose comments are requested.
        cursor: Paging cursor sent to the endpoint.
        count: Page size sent to the endpoint.
        comment_id: Parent comment identifier; truthy selects the reply list.
        s1: Lower bound of the post-request delay, in seconds.
        s2: Upper bound of the post-request delay, in seconds. If ``s2 > s1``
            the delay is uniform in ``[s1, s2]``; if equal it is ``s1``;
            if ``s2 < s1`` there is no delay.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the scraper forever.

    Returns:
        The decoded JSON response (its ``status_code`` field is 0).

    Raises:
        ConnectionError: The response's ``status_code`` is missing or non-zero.
        ValueError: The response body is not valid JSON (logged, re-raised).
        requests.exceptions.RequestException: Transport failure or timeout.
    """
    # NOTE(review): 'aid' 6383 is sent on every request; its meaning is not
    # visible in this file - confirm against the API before changing it.
    params = {'count': count, 'aid': 6383}
    if comment_id:
        url = 'https://www.douyin.com/aweme/v1/web/comment/list/reply/'
        params['item_id'] = aweme_id
        params['comment_id'] = comment_id
    else:
        url = 'https://www.douyin.com/aweme/v1/web/comment/list/'
        params['aweme_id'] = aweme_id
    params['cursor'] = cursor

    r = requests.get(url, headers=HEAD, params=params, timeout=timeout)
    try:
        respond = r.json()
    except ValueError:
        # Leave a trace of which URL returned a non-JSON body, then let the
        # original exception propagate unchanged.
        logger.error(f'fail to request {url}')
        raise

    # A payload without 'status_code' is treated as a failure, not a KeyError.
    if respond.get('status_code') == 0:
        _pause(s1, s2)
        return respond

    logger.error(f'fail to request {url}')
    logger.error(respond)
    raise ConnectionError(f'fail to request {url}, respond: {respond}')


def transform_comment(comment):
    """Flatten a raw comment dict into the columns listed in ``INFO_COLUMNS``.

    ``create_time`` is converted from an epoch timestamp to a
    ``'%Y-%m-%d %H:%M:%S'`` string (local time, as produced by
    ``datetime.fromtimestamp``). ``uid``, ``short_id``, ``nickname`` and
    ``signature`` are lifted from the nested ``user`` dict. The input dict is
    not modified, so the same raw comment can be transformed more than once.

    Args:
        comment: Raw comment dict containing at least ``create_time`` and
            ``user``, plus every other key named in ``INFO_COLUMNS``.

    Returns:
        A new dict with the ``INFO_COLUMNS`` keys and ``ip_label`` (empty
        string when the raw comment has none).
    """
    user = comment['user']
    created = datetime.datetime.fromtimestamp(int(comment['create_time']))

    # Work on a shallow copy so the caller's dict keeps its raw values.
    flat = dict(comment)
    flat['create_time'] = created.strftime('%Y-%m-%d %H:%M:%S')
    flat['uid'] = user['uid']
    flat['short_id'] = user['short_id']
    flat['nickname'] = user['nickname']
    flat['signature'] = user['signature']

    row = {k: flat[k] for k in INFO_COLUMNS}
    row['ip_label'] = comment.get('ip_label', '')
    return row


def parse_comment(comments, s1=1, s2=3):
    """Filter, flatten and persist one page of comments, then follow replies.

    Comments whose ``text`` contains any entry of ``KEYWORDS`` (or all
    comments when ``KEYWORDS`` is empty) are transformed and appended to
    ``{RESULT_PATH}/comments.csv``. Every comment reporting a non-zero
    ``reply_comment_total`` has its reply thread loaded via ``read_comment``,
    whether or not the comment itself matched the keywords.

    Args:
        comments: Iterable of raw comment dicts; ``None`` is treated as an
            empty page.
        s1: Lower delay bound forwarded to the reply requests.
        s2: Upper delay bound forwarded to the reply requests.

    Returns:
        The list of transformed comments from this page that passed the
        keyword filter (replies are not included).
    """
    rows = []
    subs = []
    for comment in comments or []:
        logger.debug(f'parse comment:\n{comment}')
        # Sub-comments (replies) do not carry these attributes.
        if comment.get('reply_comment_total', 0):
            subs.append((comment['aweme_id'], comment['cid']))

        if KEYWORDS and not any(keyword in comment['text'] for keyword in KEYWORDS):
            continue

        row = transform_comment(comment)
        logger.debug(row)
        rows.append(row)

    # The file is opened exactly as before (append, default newline handling)
    # and rows end in '\n', so plain rows stay byte-compatible with existing
    # output; csv.writer only adds quoting where a field contains a comma,
    # quote or line break that would otherwise corrupt the row.
    with open(f'{RESULT_PATH}/comments.csv', mode='a', encoding=ENCODING,
              errors='ignore') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows([str(row[k]) for k in RESULT_COLUMNS] for row in rows)

    if subs:
        logger.debug('load sub comment')
        for note_id, comment_id in subs:
            read_comment(note_id, comment_id=comment_id, cursor=0, s1=s1, s2=s2)
    return rows


def read_comment(aweme_id, comment_id=None, cursor=0, s1=1, s2=3):
    """Walk every page of a comment list (or reply list) and parse each page.

    Pages are requested starting at ``cursor`` and continue, using the cursor
    returned by each response, for as long as the response reports
    ``has_more``.

    Args:
        aweme_id: Identifier of the item whose comments are read.
        comment_id: Parent comment identifier; when given, replies to that
            comment are read instead of top-level comments.
        cursor: Cursor of the first page to request.
        s1: Lower bound of the delay after each request, in seconds.
        s2: Upper bound of the delay after each request, in seconds.
    """
    while True:
        data = request_comment(aweme_id, cursor=cursor, comment_id=comment_id,
                               s1=s1, s2=s2)
        # A page may carry no 'comments' list; parse_comment treats that as empty.
        parse_comment(data.get('comments'), s1=s1, s2=s2)
        if not data.get('has_more'):
            break
        logger.debug('load next page')
        cursor = data['cursor']