Baidu Translate (example)
Tip: if the sign/token on the desktop site can't be obtained, you can test against the mobile endpoint instead.
<code>import requests

# A mobile User-Agent matters here: the mobile basetrans endpoint
# does not require the sign/token that the desktop site computes in JS
header = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36'}
data = {
    'from': 'zh',           # source language
    'to': 'en',             # target language
    'query': '王明昌博客',   # text to translate
}
post_url = "http://fanyi.baidu.com/basetrans"
r = requests.post(post_url, data=data, headers=header)
print(r.content.decode())
</code>
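The endpoint responds with JSON, so the translated text can be pulled out with r.json(). A minimal sketch; the 'trans'/'dst' field names are an assumption based on this endpoint's typical payload, so inspect the raw response first if the lookup comes back empty:
<code>import requests

header = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36'}
data = {'from': 'zh', 'to': 'en', 'query': '王明昌博客'}
r = requests.post("http://fanyi.baidu.com/basetrans", data=data, headers=header)

result = r.json()
# 'trans' / 'dst' are assumed field names, not confirmed by the original post
for part in result.get('trans', []):
    print(part.get('dst'))
</code>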
Fetching the article list from a Jianshu profile page

<code>import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

# walk the first three pages of the profile's article list
for i in range(1, 4):
    url_ = 'https://www.jianshu.com/u/4642b9fae22c?order_by=shared_at&page={}'.format(i)
    res = requests.get(url_, headers=headers)
    res = etree.HTML(res.content.decode())
    nodes = res.xpath('//ul[@class="note-list"]/li')
    for node in nodes:
        item = {}
        item['title'] = node.xpath('.//a[@class="title"]/text()')[0].strip()
        item['time'] = node.xpath('.//span[@class="time"]/@data-shared-at')[0]
        item['abstract'] = node.xpath('.//p[@class="abstract"]/text()')[0].strip()
        item['img'] = node.xpath('.//img[@class=" img-blur-done"]/@src')  # cover image URL, may be empty
        item['url'] = 'https://www.jianshu.com' + node.xpath('.//a/@href')[0]
        print(item)
</code>
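To keep the scraped items rather than just printing them, one option is to append each dict as a line of JSON. A minimal sketch; the save_item helper and the articles.jsonl filename are illustrative, not from the original:
<code>import json

def save_item(item, path='articles.jsonl'):
    # append one article per line as UTF-8 JSON (JSON Lines format)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

# inside the loop above, replace print(item) with:
# save_item(item)
</code>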
Downloading images
<code># -*- coding: utf-8 -*-
import requests
import re
import os

class GetImage(object):
    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        self.dir_path = os.path.dirname(os.path.abspath(__file__))
        self.path = self.dir_path + '/imgs'
        # create the target directory if it does not exist yet
        if not os.path.exists(self.path):
            os.makedirs(self.path)

    def download(self, url):
        try:
            res = requests.get(url, headers=self.headers)
            return res
        except Exception as e:
            print(url + ' download failed: ' + str(e))

    def parse(self, res):
        content = res.content.decode()
        # grab every <img src="..."> and prefix it with the skin's base path
        img_list = re.findall(r'<img.*?src="(.*?)"', content, re.S)
        img_list = ['http://www.yangqq.com/skin/jxhx/' + url for url in img_list]
        return img_list

    def save(self, res_img, file_name):
        if res_img:
            with open(file_name, 'wb') as f:
                f.write(res_img.content)
            print(file_name + ' saved')

    def run(self):
        # download the page itself
        res = self.download(self.url)
        if res is None:
            return
        # extract the image URLs
        url_list = self.parse(res)
        # fetch and save each image
        for url in url_list:
            res_img = self.download(url)
            name = url.strip().split('/').pop()
            file_name = self.path + '/' + name
            self.save(res_img, file_name)

if __name__ == '__main__':
    url_list = ['https://www.yangqq.com/skin/jxhx/', 'https://www.yangqq.com/skin/jxhx/list.html',
                'https://www.yangqq.com/skin/jxhx/share.html', 'https://www.yangqq.com/skin/jxhx/list2.html',
                'https://www.yangqq.com/skin/jxhx/list3.html', 'https://www.yangqq.com/skin/jxhx/daohang.html',
                'https://www.yangqq.com/skin/jxhx/about.html']
    for url in url_list:
        text = GetImage(url)
        text.run()
</code>
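The hard-coded 'http://www.yangqq.com/skin/jxhx/' prefix in parse() breaks as soon as a page uses absolute or root-relative image paths. A more robust sketch resolves each src against the page URL with urllib.parse.urljoin; this is a drop-in replacement for GetImage.parse, not the original author's code:
<code>from urllib.parse import urljoin
import re

def parse(self, res):
    content = res.content.decode()
    img_list = re.findall(r'<img.*?src="(.*?)"', content, re.S)
    # urljoin handles relative, root-relative, and absolute src values alike
    return [urljoin(self.url, src) for src in img_list]

# GetImage.parse = parse  # monkey-patch, or paste the body into the class
</code>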
Scraping short reviews of Ne Zha (《哪吒之魔童降世》) from Douban and saving them to CSV
<code># -*- coding: utf-8 -*-
import requests
import csv
import os
import time
from lxml import etree

def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response
        return None
    except requests.RequestException as e:
        print(e)
        return None

def parse_one_page(res):
    info = []
    res = etree.HTML(res.content.decode())
    nodes_list = res.xpath('//div[@class="comment-item"]')
    for node in nodes_list:
        comic = {}
        comic['User'] = node.xpath('.//span[@class="comment-info"]/a/text()')[0].strip()
        comic['Time'] = node.xpath('.//span[@class="comment-info"]/span[3]/text()')[0].strip()
        comic['Comment'] = node.xpath('.//span[@class="short"]/text()')[0].strip()
        print(comic)
        info.append(comic)
    return info

def write_to_file(info):
    file_name = '《哪吒之魔童降世》短评.csv'
    # write the header row only once, when the file is first created
    new_file = not os.path.exists(file_name)
    with open(file_name, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['User', 'Time', 'Comment'])
        if new_file:
            writer.writeheader()
        writer.writerows(info)

def main(start):
    url = 'https://movie.douban.com/subject/26794435/comments?start=' + str(start) + '&limit=20&sort=new_score&status=P&percent_type='
    html = get_one_page(url)
    if html is None:
        return
    data = parse_one_page(html)
    write_to_file(data)

if __name__ == '__main__':
    for i in range(10):
        main(i * 20)
        print('Page {} scraped.'.format(i))  # progress marker after each page
        time.sleep(1)  # pause one second between pages
</code>
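To sanity-check the output, the CSV can be read back with csv.DictReader; a quick sketch, using the same filename and utf-8-sig encoding that write_to_file uses above:
<code>import csv

with open('《哪吒之魔童降世》短评.csv', encoding='utf-8-sig', newline='') as f:
    for row in csv.DictReader(f):
        # print a short preview of each saved review
        print(row['User'], row['Time'], row['Comment'][:30])
</code>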
Proxies
<code>import requests

# route HTTP traffic through a proxy; the address here is just a sample
proxies = {"http": "http://27.152.90.200:80"}
header = {}
requests.get("http://www.baidu.com", proxies=proxies, headers=header)
</code>
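A quick way to confirm the proxy is actually in use is to ask an IP-echo service which address it sees; httpbin.org/ip is used here for illustration, and the sample proxy above may well be dead by now, so expect timeouts:
<code>import requests

proxies = {"http": "http://27.152.90.200:80"}  # sample address; likely stale
try:
    r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
    print(r.json())  # should show the proxy's IP, not your own
except requests.RequestException as e:
    print('proxy check failed:', e)
</code>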