如果你对该文章中的内容有疑问或不解,可以点击此处链接提问;
提问时请注明问题和此文章的链接地址。点击此处跳转
过程
- 获取列表链接
- 打开链接获取详情
- 处理数据
- 保存数据
- 对数据进行处理
- 处理杂质
- 可视化数据
<code class="">import urllib
import pandas as pd
import requests
from lxml import etree
import os
import csv
import time
import pymysql
# 全局取消证书验证
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# 获取链接
def get_url_list(num):
    """Crawl listing pages 1..num, extract image items and insert them into MySQL.

    num: number of listing pages to fetch (pages are 1-based).
    Returns None; each scraped item is persisted via db_insert as a side effect.
    """
    all_num = 0
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    requests.adapters.DEFAULT_RETRIES = 5  # retry flaky connections
    for i in range(1, num + 1):
        url_ = 'http://www.***.com/photo/list/?page={}'.format(i)
        res = requests.get(url_, headers=headers)
        doc = etree.HTML(res.content.decode())
        # Evaluate each document-wide XPath once per page. The original
        # re-ran all three absolute queries for every node and indexed the
        # results with a manual counter k — O(n^2) and IndexError-prone.
        titles = doc.xpath('//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/p/text()')
        urls = doc.xpath('//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/@href')
        imgs = doc.xpath('//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/img/@data-backup')
        # zip truncates to the shortest list, so a page where the three
        # queries disagree in length can no longer raise IndexError.
        for title, url, img in zip(titles, urls, imgs):
            item = {
                'title': title,
                'url': url,
                'pic': img,
                'pic_new': img.replace('http://img.***.com', 'http://oss1.wangmingchang.com/0bd86e854d29ca97c3510e774d9cd4d4/uploads', 1),
                'status': 0,  # 0 = image not yet downloaded
            }
            print(item)
            db_insert(item)
            all_num += 1
        print("====第" + format(i) + "页已完成====")
    print("OK")
# 保存数据
def write_to_file(info, path='采集链接列表.csv'):
    """Append scraped items to a CSV file.

    info: iterable of dicts with keys title/url/pic/pic_new/status
          (the shape produced by get_url_list).
    path: target CSV file; defaults to the original hard-coded name so
          existing one-argument callers are unaffected.
    """
    # Field names must match the item keys exactly. The old list used
    # 'img' and omitted 'pic_new', so csv.DictWriter raised ValueError on
    # every call — silently swallowed by a bare `except: pass`.
    fieldnames = ['title', 'url', 'pic', 'pic_new', 'status']
    # Write the header only when the file is new or empty; the original
    # emitted a fresh header row on every append.
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if write_header:
            writer.writeheader()
        writer.writerows(info)
# //清洗数据
def handle_data(path='采集链接列表-2.csv'):
    """Load the scraped-link CSV, drop duplicate rows, and return the result.

    path: CSV file to clean; defaults to the original hard-coded file so
          existing no-argument callers keep working.

    Returns the de-duplicated DataFrame (rows unique on title+url, first
    occurrence kept). The frame is also printed, preserving the original
    behavior. The original returned None, which made the cleanup result
    unusable by callers.
    """
    data = pd.read_csv(path)
    # Two rows are duplicates when both title and url match.
    newdata = data.drop_duplicates(subset=['title', 'url'], keep='first')
    print(newdata)
    # Callers can now persist the result themselves, e.g.
    # handle_data().to_csv('清洗去重后的数据.csv')
    return newdata
# 保存图片
def save_img(img_url):
    """Download img_url to a local path mirroring the URL structure.

    Returns 1 on success, 0 on any failure (original contract: callers
    test the int, no exception escapes).

    NOTE(review): the local target path is derived from the URL text
    itself, so http://host/a/b.jpg lands in ./http:/host/a/b.jpg —
    behavior preserved from the original.
    """
    # Bug fix: the file only does `import urllib`, which does not load the
    # `urllib.request` submodule; import it explicitly so build_opener /
    # urlretrieve resolve instead of raising AttributeError.
    import urllib.request

    file_path = os.path.dirname(img_url)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
    urllib.request.install_opener(opener)
    try:
        if not os.path.exists(file_path):
            print("创建目录...")
            os.makedirs(file_path)
        # splitext base + suffix just reassembles the original string, so
        # the download target is the URL text used as a local path.
        filename = '{}{}'.format(*os.path.splitext(img_url))
        urllib.request.urlretrieve(img_url, filename=filename)
        return 1
    except Exception:
        # Best-effort download: the original had two identical handlers
        # (IOError, then Exception), both returning 0. Collapsed into one;
        # failure is signalled via the return code only.
        return 0
# 操作数据库
def db_insert(data):
    """Insert one scraped item into the `list` table unless it already exists.

    data: dict with keys title/pic/pic_new/url/status.

    NOTE(review): host/user/password are hard-coded; acceptable for a
    local scraping script, not for anything shared.
    """
    db = pymysql.connect(host="localhost", port=3306, user="root", password="",
                         db="py_test", charset="utf8")
    try:
        cursor = db.cursor()
        if db_is_cz(db, data) == 1:
            # Parameterized query — never build SQL with % or + concatenation.
            sql = 'INSERT INTO list(title, pic,pic_new, url, status) VALUES (%s, %s, %s, %s,%s)'
            parm = (data['title'], data['pic'], data['pic_new'], data['url'], data['status'])
            try:
                db.ping(reconnect=True)  # reconnect if the link was dropped
                cursor.execute(sql, parm)
                db.commit()
                print("ok")
            except Exception:
                # Narrowed from a bare except: still best-effort, but no
                # longer swallows KeyboardInterrupt/SystemExit.
                db.rollback()
                print("error")
        else:
            print("数据已存在..")
    finally:
        # Bug fix: the original never closed the connection on the
        # "already exists" branch — always close it.
        db.close()
# 判断数据是否存在
def db_is_cz(db, data):
    """Return 1 when (title, url, pic) is absent from `list`, else 0.

    db: an open pymysql connection owned by the caller.
    data: dict with title/url/pic keys.

    The inverted-looking convention (1 = not present = safe to insert) is
    preserved from the original caller contract in db_insert.
    """
    try:
        with db.cursor() as cursor:
            sql = 'select * from list where title= %s and url= %s and pic= %s'
            parm = (data['title'], data['url'], data['pic'])
            db.ping(reconnect=True)
            cursor.execute(sql, parm)
            # fetchone() is truthy when a matching row exists.
            return 0 if cursor.fetchone() else 1
    except Exception:
        print("error")
        # Explicitly report "exists" (0) so the caller skips the insert —
        # the same effective outcome as the original's implicit None.
        # (The original also rolled back, pointless for a SELECT.)
        return 0
    # NOTE(review): the original closed the caller's connection here; a
    # helper must not close a connection it does not own (db_insert keeps
    # using it and had to ping(reconnect=True) to survive).
def test():
    """Scratch helper kept from the article; simply returns 0."""
    return 0
if __name__ == '__main__':
    # Crawl the first 5 listing pages. The result is not bound: the
    # original `list = get_url_list(5)` shadowed the builtin `list` and
    # bound None anyway (get_url_list has no return value).
    get_url_list(5)
</code>