Python 之爬虫开发实践

作者: tww844475003 分类: Python 发布时间: 2021-05-22 16:42

下载视频图片

import requests
import re
import os
from urllib.request import urlretrieve


# Download the cover image of every video on a Pear Video category page
def downloadVideoImg():
    """Download the cover image of each video linked from the category page.

    Fetches ``https://www.pearvideo.com/category_8``, extracts every link
    carrying the ``vervideo-lilink actplay`` class, opens each video page,
    reads the title (``<h1 class="video-tt">``) and the cover image URL
    (``data-picurl``), and saves the image as ``images/<title>.jpg``.

    Video pages on which the title or the image URL cannot be found are
    skipped instead of aborting the whole run with an ``IndexError``.

    Returns:
        None. The results are the files written under ``images``.
    """
    host = 'https://www.pearvideo.com/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # Without a timeout a stalled connection would block the script forever.
    timeout = 10

    # Patterns do not change between iterations, so define them once.
    reg = r'<a href="(.*?)" class="vervideo-lilink actplay">'
    nameReg = r'<h1 class="video-tt">(.*?)</h1>'
    imgReg = r'data-picurl="(.*?)">'

    # Characters that are not allowed in file names on common file systems.
    unsafeChars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    html = requests.get(host + 'category_8', headers=headers, timeout=timeout).text
    videoIds = re.findall(reg, html)

    # Create the target directory once; exist_ok avoids listing the cwd.
    path = 'images'
    os.makedirs(path, exist_ok=True)

    for videoId in videoIds:
        videoUrl = host + videoId
        videoHtml = requests.get(videoUrl, headers=headers, timeout=timeout).text
        # Video title
        videoName = re.findall(nameReg, videoHtml)
        # Video cover image URL
        videoImg = re.findall(imgReg, videoHtml)

        # The page layout may differ from what the patterns expect.
        if not videoName or not videoImg:
            print('跳过(未找到视频名称或图片):%s' % videoUrl)
            continue

        print('正在下载视频图片:%s' % videoName[0])
        # Titles are free text; replace characters that would break the path.
        safeName = videoName[0].translate(unsafeChars).strip()
        filePath = os.path.join(path, '%s.jpg' % safeName)
        urlretrieve(videoImg[0], filePath)


# Script entry: run the image download as soon as this snippet is executed.
downloadVideoImg()

Flask + requests 实现搜索功能

# search.py
from flask import Flask
from flask import render_template
from flask import request
from spider import getBaiDuSearchResult

# Application object on which the route decorators in this file register views.
app = Flask(__name__)

@app.route('/')
def index():
    """Render the ``index.html`` template for the site root."""
    page = render_template('index.html')
    return page

@app.route('/s')
def search():
    """Proxy a search request and return the upstream result page.

    Query parameters:
        wd: the search keyword; defaults to an empty string when absent.
        pn: the result offset; defaults to ``0`` when absent or when the
            supplied value is not an integer.

    Returns:
        The HTML text returned by ``getBaiDuSearchResult``.
    """
    # The search form only submits ``wd``, so ``pn`` is normally missing.
    # Without defaults the literal text "None" would be sent upstream.
    keyWord = request.args.get('wd', '')
    pn = request.args.get('pn', default=0, type=int)
    html = getBaiDuSearchResult(keyWord, pn)
    return html

# Start the server only when this file is run directly, not when imported.
# NOTE(review): debug=True is intended for local development - confirm it is
# switched off before exposing this app to other machines.
if __name__=='__main__':
    app.run(debug=True)


# spider.py
import requests

def getBaiDuSearchResult(keyWord, pn):
    """Fetch a Baidu search result page and return its HTML text.

    Args:
        keyWord: the search keyword, sent as the ``wd`` query parameter.
        pn: the result offset, sent as the ``pn`` query parameter.

    Returns:
        The response body as text.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # Pass the query as ``params`` so that requests URL-encodes it; building
    # the URL by hand breaks on keywords containing '&', '#', '+' or spaces.
    # NOTE(review): per the requests docs, keys whose value is None are left
    # out of the query string instead of being sent as the text "None".
    params = {'wd': keyWord, 'pn': pn}
    # A timeout keeps a stalled upstream from blocking the caller forever.
    response = requests.get('https://www.baidu.com/s', params=params, headers=headers, timeout=10)
    return response.text


<!-- templates/index.html (Flask 默认从 templates 目录加载模板) -->
<!doctype html>
<!-- Search form page: submits the keyword to /s as the "wd" query parameter. -->
<!-- The page text is Chinese, so the document language is declared as zh-CN. -->
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <!-- Zooming is left enabled so the page stays usable for low-vision users. -->
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>搜索功能</title>
</head>
<body>
    <form action="/s" method="get">
        <!-- The label is tied to the input so assistive technology can announce it. -->
        <label for="wd">搜索:</label><input type="text" id="wd" name="wd" required>
        <input type="submit" value="提交">
    </form>
</body>
</html>

requests + lxml 下载“全小说”网小说

import requests
from lxml import etree
import os

# Download a novel chapter by chapter
def novelDownload():
    """Download every chapter of the novel at ``https://qxs.la/179922/``.

    Reads the chapter index page, collects the chapter links, fetches each
    chapter page and writes its text to ``text/<chapter title>.txt`` (UTF-8).

    Chapter pages on which no title is found are skipped instead of
    aborting the whole run with an ``IndexError``.

    Returns:
        None. The results are the files written under ``text``.
    """
    # Without a timeout a stalled connection would block the script forever.
    timeout = 10

    html = requests.get('https://qxs.la/179922/', timeout=timeout).text
    doc = etree.HTML(html)
    contents = doc.xpath('//*[@class="chapters"]/div')

    # Create the directory that holds the chapter files.
    path = 'text'
    os.makedirs(path, exist_ok=True)

    # Nothing to download when the index page has no chapter containers.
    if not contents:
        return

    # An XPath that starts with '//' is evaluated against the whole document
    # even when called on a sub-element, so running it once per container
    # re-downloaded every chapter len(contents) times. Query once instead and
    # drop repeated links while keeping their order.
    links = list(dict.fromkeys(doc.xpath('//*[@class="chapter"]/a/@href')))

    # Characters that are not allowed in file names on common file systems.
    unsafeChars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    for link in links:
        url = 'https://qxs.la' + link
        articleHtml = requests.get(url, timeout=timeout).text
        articleDoc = etree.HTML(articleHtml)
        articleContent = articleDoc.xpath('//*[@id="content"]/text()')
        articleTitle = articleDoc.xpath('/html/body/div[3]/h1/text()')

        # The title XPath is positional and may not match every page.
        if not articleTitle:
            print('跳过(未找到章节标题):' + url)
            continue

        # Write the chapter text
        print('下载中:' + articleTitle[0])
        # Titles are free text; replace characters that would break the path.
        safeTitle = articleTitle[0].translate(unsafeChars).strip()
        with open(os.path.join(path, '%s.txt' % safeTitle), 'w', encoding='utf-8') as f:
            f.writelines(articleContent)


# Script entry: run the novel download as soon as this snippet is executed.
novelDownload()

前端开发那点事
微信公众号搜索“前端开发那点事”

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注