下载视频图片
import requests
import re
import os
from urllib.request import urlretrieve
# 下载视频
def downloadVideoImg():
    """Download the cover image of every video on the Pear Video category page.

    Fetches the ``category_8`` listing, follows each video link found in it and
    saves that video's cover picture as ``images/<video title>.jpg``. Video
    pages on which the title or the picture URL cannot be found are skipped
    with a message instead of aborting the whole run.
    """
    host = 'https://www.pearvideo.com/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # Patterns are loop-invariant, so define them once up front.
    reg = r'<a href="(.*?)" class="vervideo-lilink actplay">'
    nameReg = r'<h1 class="video-tt">(.*?)</h1>'
    imgReg = r'data-picurl="(.*?)">'

    # Create the output directory once; exist_ok avoids the check-then-create race.
    path = 'images'
    os.makedirs(path, exist_ok=True)

    # A timeout keeps a stalled connection from hanging the script forever.
    html = requests.get(host + 'category_8', headers=headers, timeout=10).text
    for videoId in re.findall(reg, html):
        videoUrl = host + videoId
        videoHtml = requests.get(videoUrl, headers=headers, timeout=10).text
        videoName = re.findall(nameReg, videoHtml)
        videoImg = re.findall(imgReg, videoHtml)
        if not videoName or not videoImg:
            # The page layout did not match; skip rather than crash on [0].
            print('跳过(未找到视频名称或图片):%s' % videoUrl)
            continue
        print('正在下载视频图片:%s' % videoName[0])
        # Titles may contain characters that are illegal in file names.
        safeName = re.sub(r'[\\/:*?"<>|]', '_', videoName[0]).strip()
        filePath = os.path.join(path, '%s.jpg' % safeName)
        urlretrieve(videoImg[0], filePath)
# Run the image download as soon as this script is executed.
downloadVideoImg()
Flask + requests 实现搜索功能
// search.py
from flask import Flask
from flask import render_template
from flask import request
from spider import getBaiDuSearchResult
# Flask application object that the route decorators below register on.
app = Flask(__name__)
@app.route('/')
def index():
    """Serve the search form page rendered from ``index.html``."""
    page = render_template('index.html')
    return page
@app.route('/s')
def search():
    """Proxy a search request and return the fetched result page.

    Reads the keyword from the ``wd`` query parameter and the result offset
    from ``pn``. When no keyword is supplied the search form is shown again,
    and a missing ``pn`` defaults to the first page, so the literal string
    "None" is never forwarded as a search term or offset.
    """
    keyWord = request.args.get('wd', '')
    if not keyWord.strip():
        # Nothing to search for: show the form instead of searching for "None".
        return render_template('index.html')
    # The form on the index page sends no ``pn``; start from the first result.
    pn = request.args.get('pn', '0')
    return getBaiDuSearchResult(keyWord, pn)
# Start the Flask server in debug mode only when this file is run as a script.
if __name__ == '__main__':
    app.run(debug=True)
// spider.py
import requests
def getBaiDuSearchResult(keyWord, pn):
    """Fetch a Baidu search result page and return its HTML text.

    Args:
        keyWord: Search term, sent as the ``wd`` query parameter.
        pn: Result offset, sent as the ``pn`` query parameter. ``None`` omits
            the parameter instead of sending the literal string "None".

    Returns:
        The response body decoded as text.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
    }
    # Let requests build the query string: values are URL-encoded (so keywords
    # containing '&', '#' or spaces stay intact) and None values are dropped.
    params = {'wd': keyWord, 'pn': pn}
    response = requests.get(
        'https://www.baidu.com/s',
        params=params,
        headers=headers,
        timeout=10,  # do not hang the web request on a stalled upstream
    )
    return response.text
// templates/index.html
<!doctype html>
<!-- Search form page: submits the keyword as the "wd" query parameter to /s via GET. -->
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <meta name="viewport"
          content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title>搜索功能</title>
</head>
<body>
<form action="/s" method="get">
    <!-- The label is tied to the input so assistive technology announces it. -->
    <label for="wd">搜索:</label><input type="text" id="wd" name="wd">
    <input type="submit" value="提交">
</form>
</body>
</html>
requests + lxml 下载“全小说”网小说
import requests
from lxml import etree
import os
# 小说下载
def novelDownload():
    """Download every chapter of the novel at https://qxs.la/179922/ into ``text/``.

    Reads the chapter index page, follows each chapter link found inside the
    ``chapters`` container and writes the chapter body to
    ``text/<chapter title>.txt`` (UTF-8). Chapter pages whose title cannot be
    found are skipped with a message.
    """
    # Characters that are not valid in file names are replaced by '_'.
    unsafeChars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    # A timeout keeps a stalled connection from hanging the script forever.
    html = requests.get('https://qxs.la/179922/', timeout=10).text
    doc = etree.HTML(html)
    contents = doc.xpath('//*[@class="chapters"]/div')

    # Create the output directory; exist_ok avoids the check-then-create race.
    path = 'text'
    os.makedirs(path, exist_ok=True)

    for content in contents:
        # Keep the query relative to this div. An expression starting with '//'
        # searches the whole document, so it would return every chapter link
        # again on each iteration and download each chapter many times.
        links = content.xpath('descendant-or-self::*[@class="chapter"]/a/@href')
        for link in links:
            url = 'https://qxs.la' + link
            articleHtml = requests.get(url, timeout=10).text
            articleDoc = etree.HTML(articleHtml)
            articleContent = articleDoc.xpath('//*[@id="content"]/text()')
            articleTitle = articleDoc.xpath('/html/body/div[3]/h1/text()')
            if not articleTitle:
                # The page layout did not match; skip rather than crash on [0].
                print('跳过(未找到章节标题):' + url)
                continue
            print('下载中:' + articleTitle[0])
            safeTitle = articleTitle[0].translate(unsafeChars).strip()
            filePath = os.path.join(path, '%s.txt' % safeTitle)
            with open(filePath, 'w', encoding='utf-8') as f:
                f.writelines(articleContent)
# Run the novel download as soon as this script is executed.
novelDownload()