douyin/scraper.py

104 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project redbook
@File scraper.py
@IDE PyCharm
@Author rengengchen
@Time 2024/4/17 11:53
"""
import csv
import datetime
import random
import time

import requests
from loguru import logger

from config import HEAD, INFO_COLUMNS, RESULT_COLUMNS, RESULT_PATH, ENCODING, KEYWORDS
def request_comment(aweme_id, cursor=0, count=20, comment_id=None, s1=1, s2=3):
    """Fetch one page of comments (or replies to a comment) for a Douyin video.

    Args:
        aweme_id: ID of the video whose comments are requested.
        cursor: pagination cursor from the previous page (0 = first page).
        count: number of comments requested per page.
        comment_id: when given, fetch replies to this comment instead of
            top-level comments.
        s1, s2: bounds in seconds of the random pause taken after a
            successful request (scrape throttling).

    Returns:
        The decoded JSON response (its ``status_code`` field is 0).

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.
        ConnectionError: when the API answers with a non-zero status_code.
    """
    is_child = bool(comment_id)
    params = {'count': count, 'aid': 6383, 'cursor': cursor}
    if is_child:
        # The reply endpoint identifies the video as item_id, not aweme_id.
        url = 'https://www.douyin.com/aweme/v1/web/comment/list/reply/'
        params['item_id'] = aweme_id
        params['comment_id'] = comment_id
    else:
        url = 'https://www.douyin.com/aweme/v1/web/comment/list/'
        params['aweme_id'] = aweme_id
    r = requests.get(url, headers=HEAD, params=params)
    # Fail fast on HTTP errors instead of surfacing a confusing JSON decode error.
    r.raise_for_status()
    respond = r.json()
    if respond['status_code'] == 0:
        # Polite throttling: sleep a random interval in [s1, s2]
        # (exactly s1 when s1 == s2, no sleep when s1 > s2).
        if s2 > s1:
            gap = random.uniform(s1, s2)
        elif s1 == s2:
            gap = s1
        else:
            gap = 0
        time.sleep(gap)
        return respond
    logger.error(f'fail to request {url}')
    logger.error(respond)
    raise ConnectionError(f'fail to request {url}, respond: {respond}')
def transform_comment(comment):
    """Flatten one raw comment dict into a row keyed by INFO_COLUMNS.

    Converts the epoch ``create_time`` to a ``YYYY-mm-dd HH:MM:SS`` string
    and lifts the nested ``user`` fields (uid, short_id, nickname,
    signature) to the top level.

    Args:
        comment: one raw comment object from the Douyin comment API.

    Returns:
        A new dict restricted to INFO_COLUMNS plus ``ip_label``.
    """
    # Work on a shallow copy so the caller's dict is left untouched
    # (the original version mutated the shared comment object in place).
    comment = dict(comment)
    dt_object = datetime.datetime.fromtimestamp(int(comment['create_time']))
    comment['create_time'] = dt_object.strftime('%Y-%m-%d %H:%M:%S')
    user = comment['user']
    comment['uid'] = user['uid']
    comment['short_id'] = user['short_id']
    comment['nickname'] = user['nickname']
    comment['signature'] = user['signature']
    row = {k: comment[k] for k in INFO_COLUMNS}
    # ip_label is absent on some comments; default to an empty string.
    row['ip_label'] = comment.get('ip_label', '')
    return row
def parse_comment(comments):
    """Filter, persist and return comments; recurse into reply threads.

    A comment is kept when its text contains any keyword in KEYWORDS
    (all comments are kept when KEYWORDS is empty). Kept comments are
    appended to ``comments.csv``; any comment that has replies triggers a
    paginated fetch of its reply thread via read_comment.

    Args:
        comments: list of raw comment dicts from one API page.

    Returns:
        The list of transformed (flattened) comments that were kept.
    """
    kept = []
    subs = []  # (aweme_id, comment_id) pairs whose reply threads must be fetched
    for comment in comments:
        logger.debug(f'parse comment:\n{comment}')
        # Child comments do not carry reply_comment_total.
        if comment.get('reply_comment_total', 0):
            subs.append((comment['aweme_id'], comment['cid']))
        # Keyword filter; an empty KEYWORDS list keeps everything.
        if KEYWORDS and not any(keyword in comment['text'] for keyword in KEYWORDS):
            continue
        row = transform_comment(comment)
        logger.debug(row)
        kept.append(row)
    # Use the csv module so commas, quotes and newlines inside comment
    # texts are quoted properly instead of corrupting the file
    # (newline='' is required for correct csv output).
    with open(f'{RESULT_PATH}/comments.csv', mode='a', encoding=ENCODING,
              errors='ignore', newline='') as f:
        writer = csv.writer(f)
        for row in kept:
            writer.writerow([row[k] for k in RESULT_COLUMNS])
    if subs:
        logger.debug('load sub comment')
        for note_id, comment_id in subs:
            read_comment(note_id, comment_id=comment_id, cursor=0)
    return kept
def read_comment(aweme_id, comment_id=None, cursor=0, s1=1, s2=3):
    """Fetch and parse every page of comments (or replies) for a video.

    Pages through the API starting at *cursor*, handing each page's
    comments to parse_comment, until the API reports no more pages.

    Args:
        aweme_id: video ID.
        comment_id: when given, page through the replies of this comment.
        cursor: starting pagination cursor.
        s1, s2: passed through to request_comment's throttling sleep.
    """
    data = request_comment(aweme_id, cursor=cursor, comment_id=comment_id, s1=s1, s2=s2)
    # NOTE(review): 'comments' appears to be null/absent on some responses —
    # treat that as an empty page rather than crashing.
    parse_comment(data.get('comments') or [])
    while data['has_more']:
        logger.debug('load next page')
        data = request_comment(aweme_id, cursor=data['cursor'], comment_id=comment_id, s1=s1, s2=s2)
        parse_comment(data.get('comments') or [])