kuaishou/scraper.py

199 lines
7.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
@Project redbook
@File scraper.py
@IDE PyCharm
@Author rengengchen
@Time 2024/4/17 11:53
"""
import csv
import datetime
import json
import random
import time

import requests
from loguru import logger

from config import HEAD, INFO_COLUMNS, RESULT_COLUMNS, RESULT_PATH, ENCODING, KEYWORDS
# GraphQL endpoint that request_comment() POSTs its comment queries to.
url = 'https://www.kuaishou.com/graphql'
# GraphQL document used to page through the replies of one root comment.
_SUB_COMMENT_QUERY = (
    'mutation visionSubCommentList($photoId: String, $rootCommentId: String, $pcursor: String) {\n'
    ' visionSubCommentList(photoId: $photoId, rootCommentId: $rootCommentId, pcursor: $pcursor) {\n'
    ' pcursor\n'
    ' subComments {\n'
    ' commentId\n'
    ' authorId\n'
    ' authorName\n'
    ' content\n'
    ' headurl\n'
    ' timestamp\n'
    ' likedCount\n'
    ' realLikedCount\n'
    ' liked\n'
    ' status\n'
    ' authorLiked\n'
    ' replyToUserName\n'
    ' replyTo\n'
    ' __typename\n'
    ' }\n'
    ' __typename\n'
    ' }\n'
    '}\n'
)

# GraphQL document used to page through the root comments of one video.
_ROOT_COMMENT_QUERY = (
    'query commentListQuery($photoId: String, $pcursor: String) {\n'
    ' visionCommentList(photoId: $photoId, pcursor: $pcursor) {\n'
    ' commentCount\n'
    ' pcursor\n'
    ' rootComments {\n'
    ' commentId\n'
    ' authorId\n'
    ' authorName\n'
    ' content\n'
    ' headurl\n'
    ' timestamp\n'
    ' likedCount\n'
    ' realLikedCount\n'
    ' liked\n'
    ' status\n'
    ' authorLiked\n'
    ' subCommentCount\n'
    ' subCommentsPcursor\n'
    ' subComments {\n'
    ' commentId\n'
    ' authorId\n'
    ' authorName\n'
    ' content\n'
    ' headurl\n'
    ' timestamp\n'
    ' likedCount\n'
    ' realLikedCount\n'
    ' liked\n'
    ' status\n'
    ' authorLiked\n'
    ' replyToUserName\n'
    ' replyTo\n'
    ' __typename\n'
    ' }\n'
    ' __typename\n'
    ' }\n'
    ' __typename\n'
    ' }\n'
    '}\n'
)


def _comment_delay(s1, s2):
    """Return the pause (in seconds) to apply after a request.

    A random value in [s1, s2] when s2 > s1, exactly s1 when both are equal,
    and 0 when the bounds are inverted.
    """
    if s2 > s1:
        return random.uniform(s1, s2)
    if s1 == s2:
        return s1
    return 0


def request_comment(photo_id, pcursor='', comment_id=None, s1=1, s2=3, timeout=30):
    """Fetch one page of comments for a video from the GraphQL endpoint.

    Args:
        photo_id: Identifier of the video whose comments are requested.
        pcursor: Paging cursor; the empty string asks for the first page.
        comment_id: When truthy, the replies of this root comment are
            requested instead of the video's root comments.
        s1: Lower bound of the pause (seconds) applied after the request.
        s2: Upper bound of the pause (seconds) applied after the request.
        timeout: Timeout in seconds handed to ``requests.post``; pass ``None``
            to wait indefinitely.

    Returns:
        The ``data.visionSubCommentList`` object of the decoded response when
        ``comment_id`` is given, otherwise ``data.visionCommentList``.

    Raises:
        KeyError, TypeError: The response lacks the expected ``data`` entry;
            the decoded response is logged before re-raising.
        requests.RequestException: The HTTP request failed or timed out.
    """
    if comment_id:
        query = _SUB_COMMENT_QUERY
        variables = {
            'photoId': str(photo_id),
            'rootCommentId': str(comment_id),
            'pcursor': str(pcursor),
        }
        comment_key = 'visionSubCommentList'
    else:
        query = _ROOT_COMMENT_QUERY
        variables = {
            'photoId': str(photo_id),
            'pcursor': str(pcursor),
        }
        comment_key = 'visionCommentList'

    # Serialise with json.dumps so quotes, backslashes or non-ASCII characters
    # in the variables are escaped instead of corrupting the request body.
    # Compact separators keep the body free of extra whitespace.
    params = json.dumps({'query': query, 'variables': variables}, separators=(',', ':'))

    # Without a timeout a stalled connection would block the scraper forever.
    r = requests.post(url, headers=HEAD, data=params, timeout=timeout)
    respond = r.json()
    try:
        respond = respond['data'][comment_key]
    except (KeyError, TypeError):
        # Log the whole decoded response so the failure can be diagnosed.
        logger.error(respond)
        raise

    # Pause between requests to throttle the crawl.
    time.sleep(_comment_delay(s1, s2))
    return respond
def transform_comment(comment):
    """Build an export row from one raw comment dict.

    The caller's dict is not modified, so transforming the same comment more
    than once cannot stack the profile-URL prefix onto ``authorId``/``replyTo``.

    Args:
        comment: Raw comment mapping. Must contain ``timestamp`` (handled as
            epoch milliseconds), ``authorId`` and every key in INFO_COLUMNS
            that is not derived here.

    Returns:
        A new dict holding the INFO_COLUMNS keys plus ``ip_label``. In it,
        ``create_time`` is the timestamp formatted as ``YYYY-mm-dd HH:MM:SS``
        in the machine's local timezone, ``authorId`` and ``replyTo`` are
        profile URLs (``replyTo`` is ``''`` when absent or empty), and
        ``ip_label`` defaults to ``''``.

    Raises:
        KeyError: A required key is missing from ``comment``.
    """
    # Shallow copy: derived fields are written here, never into the input.
    enriched = dict(comment)

    # fromtimestamp() expects seconds, hence the integer division by 1000.
    created = datetime.datetime.fromtimestamp(int(comment['timestamp']) // 1000)
    enriched['create_time'] = created.strftime('%Y-%m-%d %H:%M:%S')
    enriched['authorId'] = f'https://www.kuaishou.com/profile/{comment["authorId"]}'

    reply_to = comment.get('replyTo')
    enriched['replyTo'] = f'https://www.kuaishou.com/profile/{reply_to}' if reply_to else ''

    row = {k: enriched[k] for k in INFO_COLUMNS}
    row['ip_label'] = comment.get('ip_label', '')
    return row
def _matches_keywords(content):
    """Return True when no KEYWORDS are configured or one occurs in content."""
    return not KEYWORDS or any(keyword in content for keyword in KEYWORDS)


def parse_comment(photo_id, comments, level):
    """Filter, export and recurse into one page of comments.

    Every comment and every inline reply is checked against KEYWORDS on its
    own, so a reply that matches is kept even when its parent does not.
    Matching entries are appended to ``{RESULT_PATH}/comments.csv``, and the
    remaining reply pages of each comment are then loaded via read_comment().

    Args:
        photo_id: Identifier of the video the comments belong to.
        comments: Iterable of raw comment dicts; ``None`` is treated as empty.
        level: Nesting level recorded for these comments; inline replies and
            recursively loaded replies are recorded as ``level + 1``.

    Returns:
        The list of row dicts written by this call (rows written by the
        recursive read_comment() calls are not included).
    """
    video_url = f'https://www.kuaishou.com/short-video/{photo_id}'
    rows = []
    pending_subs = []  # (root comment id, cursor) pairs with more replies to load

    for comment in comments or []:
        logger.debug(f'parse comment:\n{comment}')
        # Replies do not carry these attributes, hence the defaults.
        sub_cursor = comment.get('subCommentsPcursor', 'no_more')
        sub_comments = comment.get('subComments') or []
        if sub_cursor is not None and sub_cursor != 'no_more':
            pending_subs.append((comment['commentId'], sub_cursor))

        # Keep the comment only if it contains one of the configured keywords.
        if _matches_keywords(comment['content']):
            row = transform_comment(comment)
            row['photo_id'] = video_url
            row['level'] = level
            rows.append(row)

        # Inline replies are filtered independently of their parent, just like
        # the replies that are paged in later through read_comment().
        for sub_comment in sub_comments:
            if not _matches_keywords(sub_comment['content']):
                continue
            sub_row = transform_comment(sub_comment)
            sub_row['photo_id'] = video_url
            sub_row['level'] = level + 1
            logger.debug(sub_row)
            rows.append(sub_row)

    # csv.writer quotes fields containing commas, quotes or line breaks, which
    # a plain ','.join would let corrupt the row layout. newline='' is what the
    # csv module requires of the file object; rows still end with '\n'.
    with open(f'{RESULT_PATH}/comments.csv', mode='a', encoding=ENCODING,
              errors='ignore', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows([str(row[k]) for k in RESULT_COLUMNS] for row in rows)

    if pending_subs:
        logger.debug('load sub comment')
        for comment_id, sub_cursor in pending_subs:
            read_comment(photo_id,
                         comment_id=comment_id,
                         pcursor=sub_cursor,
                         level=level + 1)
    return rows
def read_comment(photo_id, comment_id=None, pcursor='', level=1, s1=1, s2=3):
    """Page through the comments of a video, or the replies of one comment.

    Each page is fetched with request_comment() and handed to parse_comment().
    Paging stops when the returned cursor is ``'no_more'``. A ``None`` or
    empty cursor also stops paging: re-sending an empty cursor would request
    the first page again and never terminate.

    Args:
        photo_id: Identifier of the video.
        comment_id: When truthy, the replies of this root comment are read
            instead of the video's root comments.
        pcursor: Cursor of the first page to fetch; ``''`` starts at the top.
        level: Nesting level passed on to parse_comment().
        s1: Lower bound of the pause after each request (seconds).
        s2: Upper bound of the pause after each request (seconds).

    Raises:
        Exception: Any error raised while handling a page is logged together
            with the page content and re-raised unchanged.
    """
    if comment_id:
        logger.debug(f'load sub comment from {comment_id}')
        comment_col = 'subComments'
    else:
        comment_col = 'rootComments'

    first_page = True
    while True:
        if not first_page:
            logger.debug('load next page')
        first_page = False

        comment_list = request_comment(photo_id, pcursor=pcursor, comment_id=comment_id, s1=s1, s2=s2)
        logger.debug(comment_list)
        try:
            pcursor = comment_list['pcursor']
            parse_comment(photo_id, comment_list[comment_col], level=level)
        except Exception:
            # Record which list was expected and what actually came back.
            logger.error(comment_col)
            logger.error(comment_list)
            raise

        if not pcursor or pcursor == 'no_more':
            break
    logger.debug('done')