新手如何完成python视频爬虫

落寞瓜子壳 lv.3

发布时间：2023-01-14 22:54:44 357

相关标签： # python# 爬虫# json

对于刚开始学习爬虫的新手来说，最重要的是积累实战经验。可以用来编写爬虫的语言有很多种，今天我们就用 Python 来实现一个视频爬虫，下面的代码可供大家参考借鉴。

# coding=utf-8

import json
import os.path
import pprint

import requests


def get_page(pcursor):
    """Download the short videos of one Kuaishou profile, starting at ``pcursor``.

    Posts the ``visionProfilePhotoList`` GraphQL query for the hard-coded
    user id, saves every returned video as ``video/<caption>.mp4`` and keeps
    following the cursor returned by the server until it reports
    ``"no_more"`` (or returns no cursor at all).

    Args:
        pcursor: Pagination cursor to start from; the script passes ``""``
            for its initial call.

    Raises:
        requests.RequestException: On a timeout, a connection failure or a
            non-2xx status from the API or from a video download.
    """
    path = 'video/'
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists() + os.mkdir().
    os.makedirs(path, exist_ok=True)

    # Crawl target: 'https://www.kuaishou.com/profile/3xhv7zhkfr3rqag'
    # Example video pages of that profile (use Ctrl+R to batch-replace ids):
    #   https://www.kuaishou.com/short-video/3xw5fmcf9jdap29?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull
    #   https://www.kuaishou.com/short-video/3xf98wc5q2cuxtq?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull

    url = 'https://www.kuaishou.com/graphql'
    headers = {
        'content-type': 'application/json',
        'Cookie': 'kpf=PC_WEB; kpn=KUAISHOU_VISION; clientid=3; did=web_72314bf978cb158dd7034b2370d2ae70',
        'Host': 'www.kuaishou.com',
        'Origin': 'https://www.kuaishou.com',
        'Referer': 'https://www.kuaishou.com/short-video/3x6v3xmcjsd5cki?authorId=3xhv7zhkfr3rqag&streamSource=profile&area=profilexxnull',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
    }
    query = "query visionProfilePhotoList($pcursor: String, $userId: String, $page: String, $webPageArea: String) {\n  visionProfilePhotoList(pcursor: $pcursor, userId: $userId, page: $page, webPageArea: $webPageArea) {\n    result\n    llsid\n    webPageArea\n    feeds {\n      type\n      author {\n        id\n        name\n        following\n        headerUrl\n        headerUrls {\n          cdn\n          url\n          __typename\n        }\n        __typename\n      }\n      tags {\n        type\n        name\n        __typename\n      }\n      photo {\n        id\n        duration\n        caption\n        likeCount\n        realLikeCount\n        coverUrl\n        coverUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrls {\n          cdn\n          url\n          __typename\n        }\n        photoUrl\n        liked\n        timestamp\n        expTag\n        animatedCoverUrl\n        stereoType\n        videoRatio\n        profileUserTopPhoto\n        __typename\n      }\n      canAddComment\n      currentPcursor\n      llsid\n      status\n      __typename\n    }\n    hostName\n    pcursor\n    __typename\n  }\n}\n"

    # Characters that are illegal in file names on common file systems (or
    # would act as a path separator); each one is replaced with "_".
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\r\n\t'})

    # Iterate over the pages instead of recursing, so a profile with many
    # pages cannot exhaust the interpreter's recursion limit.
    while True:
        data = {
            "operationName": "visionProfilePhotoList",
            "query": query,
            "variables": {"userId": "3xhv7zhkfr3rqag", "pcursor": pcursor, "page": "detail", "webPageArea": "profilexxnull"},
        }
        # A timeout keeps a stalled connection from hanging the crawl forever.
        rsp = requests.post(url=url, json=data, headers=headers, timeout=30)
        rsp.raise_for_status()
        photo_list = rsp.json()['data']['visionProfilePhotoList']
        pcursor = photo_list['pcursor']

        # `feeds` may come back as null; treat that as an empty page.
        for feed in photo_list['feeds'] or []:
            photo = feed['photo']
            # Video title (the caption) and the direct video URL.
            title = photo['caption']
            new_url = photo['photoUrl']

            # Build a file name that is safe to create; fall back to the
            # photo id when the caption is empty after sanitising.
            file_name = (title or '').translate(unsafe_chars).strip()
            if not file_name:
                file_name = str(photo.get('id') or 'untitled')

            # Fetch the video bytes; fail loudly rather than saving an
            # error page under an .mp4 name.
            video_rsp = requests.get(url=new_url, timeout=60)
            video_rsp.raise_for_status()

            with open(f'{path}{file_name}.mp4', mode='wb') as f:
                f.write(video_rsp.content)
            print(f'=======================正在下载标题为 {title} 的快手短视频==========================')

        # Stop on the server's end marker, and also on a missing/empty
        # cursor, which would otherwise re-request the same page forever.
        if not pcursor or pcursor == "no_more":
            break


# Kick off the crawl with an empty pagination cursor.
get_page(pcursor="")

文章来源： https://blog.51cto.com/u_13488918/5992184

特别声明：以上内容（图片及文字）均为互联网收集或者用户上传发布，本站仅提供信息存储服务！如有侵权或有涉及法律问题请联系我们。