限时免费试用:欢迎注册 api.bigmodel.org ,快速体验大模型 API 接入服务。
当前位置:首页 >开发者 >其他技术 >Python

py 采集流程整理

分类:Python | 时间:2020-02-16 | 浏览:2348

过程

  1. 获取列表链接
  2. 打开链接获取详情
  3. 处理数据
  4. 保存数据
  5. 对数据进行处理
  6. 处理杂质
  7. 可视化数据
import urllib

import pandas as pd
import requests
from lxml import etree
import os
import csv
import time
import pymysql

# 全局取消证书验证
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


# 获取链接
def get_url_list(num):
    """Crawl list pages 1..num and store every entry through ``db_insert``.

    For each page the HTML is downloaded and parsed, the title, detail link
    and image address of every list entry are extracted, and an item dict
    with the keys ``title``, ``url``, ``pic``, ``pic_new`` and ``status`` is
    printed and handed to ``db_insert``.

    Args:
        num: Number of list pages to fetch; pages are numbered from 1.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        requests.RequestException: If a page cannot be downloaded, including
            when the request times out.
        IndexError: If a page yields fewer titles, links or images than
            list entries.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    # These expressions are absolute (they start with "//"), so they select
    # from the whole document no matter which node they are evaluated on.
    title_xpath = '//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/p/text()'
    url_xpath = '//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/@href'
    img_xpath = '//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a/img/@data-backup'

    arr = range(1, num + 1)
    print(arr)

    # Assigning requests.adapters.DEFAULT_RETRIES after import does not make
    # new adapters retry, so the retry count is configured explicitly on a
    # session that is reused for every page.
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        for i in arr:
            url_ = 'http://www.***.com/photo/list/?page={}'.format(i)
            # A timeout keeps one stalled connection from hanging the crawl.
            response = session.get(url_, headers=headers, timeout=30)
            page = etree.HTML(response.content.decode())
            nodes = page.xpath('//ul[@class="list-group"]/li/div/div/a')

            # Evaluate the document-wide queries once per page instead of
            # three times per entry.
            titles = page.xpath(title_xpath)
            urls = page.xpath(url_xpath)
            imgs = page.xpath(img_xpath)

            # NOTE(review): entries are matched to titles/links/images purely
            # by position; an entry without a title or image would shift the
            # pairing. Confirm against the page structure.
            for k in range(len(nodes)):
                img = imgs[k]
                item = {
                    'title': titles[k],
                    'url': urls[k],
                    'pic': img,
                    'pic_new': img.replace('http://img.***.com','http://oss1.wangmingchang.com/0bd86e854d29ca97c3510e774d9cd4d4/uploads',1),
                    'status': 0,
                }
                print(item)
                db_insert(item)
            print("====第" + str(i) + "页已完成====")
    print("OK")

# 保存数据
def write_to_file(info):
    """Append scraped items to ``采集链接列表.csv`` in the working directory.

    The column names match the keys of the item dicts built by the crawler
    (``title``, ``url``, ``pic``, ``pic_new``, ``status``). The header row is
    written only when the file is new or empty, so repeated calls keep
    appending data rows to one well-formed CSV file.

    Args:
        info: Iterable of dicts, one per row. Keys that are not columns are
            ignored; missing columns are left empty.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    path = '采集链接列表.csv'
    fieldnames = ['title', 'url', 'pic', 'pic_new', 'status']
    # Decide before opening: append mode creates the file if it is missing.
    need_header = not os.path.exists(path) or os.path.getsize(path) == 0
    # An explicit encoding keeps Chinese titles from failing on platforms
    # whose locale encoding cannot represent them.
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        if need_header:
            writer.writeheader()
        writer.writerows(info)

# 清洗数据
def handle_data():
    """Load the scraped link list and print it with duplicates removed.

    Reads ``采集链接列表-2.csv`` from the working directory. Rows that share
    the same ``title`` and ``url`` are collapsed to their first occurrence,
    and the resulting frame is printed to stdout.

    Returns:
        None.
    """
    frame = pd.read_csv("采集链接列表-2.csv")

    # Two rows are duplicates when both of these columns match.
    key_columns = ['title', 'url']
    deduplicated = frame.drop_duplicates(subset=key_columns, keep='first')

    print(deduplicated)

    # The cleaned frame is only displayed; it could be persisted with
    # ``DataFrame.to_csv`` if a cleaned file is needed.


        # save_img(url)

# 保存图片
def save_img(img_url):
    """Download an image and save it under a local path mirroring its URL.

    The directory part of ``img_url`` is created if needed and the file is
    written to a path equal to ``img_url`` itself, relative to the working
    directory.

    Args:
        img_url: Address of the image to download.

    Returns:
        1 when the image was downloaded and saved, 0 on any failure.
    """
    # Imported explicitly: a bare ``import urllib`` does not guarantee that
    # the ``request`` submodule is loaded.
    import urllib.request

    # NOTE(review): the full URL (scheme and host included) is used as the
    # local path, as in the original code. Names such as "http:" are not
    # valid directory names on every platform; confirm the intended layout.
    target_dir = os.path.dirname(img_url)

    # urlretrieve uses the globally installed opener, so the browser-like
    # User-Agent has to be installed before downloading.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
    urllib.request.install_opener(opener)
    try:
        # An empty directory part means "current directory"; makedirs('')
        # would raise, so it is skipped.
        if target_dir and not os.path.exists(target_dir):
            print("创建目录...")
            os.makedirs(target_dir, exist_ok=True)
        # Download the image and store it in the mirrored location.
        urllib.request.urlretrieve(img_url, filename=img_url)
        return 1
    except IOError as e:
        # File-system and network failures: report instead of hiding them.
        print('文件操作失败', e)
        return 0
    except Exception as e:
        # Anything else (for example a malformed URL) is best-effort too.
        print('错误 :', e)
        return 0
# 操作数据库
def db_insert(data):
    """Insert one scraped item into the ``list`` table unless it exists.

    Opens a connection to the local ``py_test`` MySQL database, asks
    ``db_is_cz`` whether the item is already stored and, if not, inserts it.
    The connection is always closed before returning.

    Args:
        data: Item dict with the keys ``title``, ``pic``, ``pic_new``,
            ``url`` and ``status``.

    Returns:
        None. The outcome is reported on stdout.
    """
    db = pymysql.connect(host="localhost", port=3306, user="root", password="",db="py_test", charset="utf8")
    try:
        is_cz = db_is_cz(db, data)
        if is_cz is None:
            # The existence check itself failed and has already reported the
            # error; do not claim the row exists.
            return
        if is_cz != 1:
            print("数据已存在..")
            return

        # Values are passed as query parameters; never build this statement
        # with % or + string formatting. The table must already exist.
        sql = 'INSERT INTO list(title, pic,pic_new, url, status) VALUES (%s, %s, %s, %s,%s)'
        parm = (data['title'], data['pic'], data['pic_new'], data['url'], data['status'])
        try:
            db.ping(reconnect=True)  # re-establish the connection if it dropped
            with db.cursor() as cursor:
                cursor.execute(sql, parm)
            db.commit()
            print("ok")
        except Exception as exc:
            db.rollback()
            print("error", exc)
    finally:
        # db_is_cz may already have closed the connection after a failure;
        # closing a closed connection would raise.
        if db.open:
            db.close()
#     判断数据是否存在
def db_is_cz(db,data):
    """Check whether an item is still absent from the ``list`` table.

    Args:
        db: Open database connection providing ``ping``, ``cursor``,
            ``rollback`` and ``close``.
        data: Item dict with the keys ``title``, ``url`` and ``pic``.

    Returns:
        1 when no matching row exists (the item may be inserted), 0 when a
        matching row exists, and None when the lookup failed. On failure the
        error is printed and the connection is rolled back and closed.
    """
    # Only existence matters, so select a constant and stop at the first hit.
    sql = 'select 1 from list where title= %s and url= %s and pic= %s limit 1'
    try:
        parm = (data['title'], data['url'], data['pic'])
        db.ping(reconnect=True)  # re-establish the connection if it dropped
        with db.cursor() as cursor:
            cursor.execute(sql, parm)
            return 0 if cursor.fetchone() else 1
    except Exception as exc:
        # Best-effort: report the problem rather than abort the crawl, but
        # let KeyboardInterrupt/SystemExit propagate (a bare except would
        # trap them as well).
        print("error", exc)
        db.rollback()
        db.close()
        return None
    # 转换成DataFrame格式
    # df = pd.DataFrame(result)
    # print(df)

def test():
    """Scratch function kept for quick experiments; always returns 0."""
    outcome = 0
    return outcome

if __name__ == '__main__':
    # Entry point: crawl the first five list pages and store every entry in
    # MySQL. handle_data() and save_img(url) can be called here instead for
    # manual cleaning or download runs.
    # get_url_list returns nothing, so its result is not bound to a name;
    # binding it to ``list`` would only shadow the builtin.
    get_url_list(5)




    # test()




本站文章如未注明出处均为原创,转载请注明出处,如有侵权请邮件联系站长。
0/500
Share your thoughts respectfully.