python爬虫+IP代理

warning: 这篇文章距离上次修改已过1050天,其中的内容可能已经有所变动。
"""
Created on Wed Nov 23 00:03:30 2022

@author: fch
"""
import time

'''
网站网址:https://m.7160.top/rentiyishu/
'''
import random
from pyquery import PyQuery as pq
import requests
import os
from joblib import Parallel, delayed
import socket
# Default timeout, in seconds, for socket objects created after this call.
socket.setdefaulttimeout(10)


# Silence the certificate warnings raised for requests sent with verify=False.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# NOTE(review): requests does not appear to read a top-level DEFAULT_RETRIES
# attribute; retries are normally configured by mounting an
# HTTPAdapter(max_retries=...) on a Session -- confirm this line has any effect.
requests.DEFAULT_RETRIES = 5  # meant to raise the number of connection retries

# Proxy pool: one entry per non-blank line of ./IP代理池.txt. Entries are kept
# as raw text; the request helpers turn an entry back into a dict before
# passing it to requests as ``proxies``.
with open('./IP代理池.txt', 'r') as f:
    # Blank lines are skipped: an empty entry cannot be converted to a proxy
    # dict and would make the request that picks it fail.
    # The ``with`` block closes the file, so no explicit close() is needed.
    IP_list = [line.strip('\n') for line in f if line.strip()]

'''
random随机读取到后为字符串类型,用eval转回字典
'''
# proxie = eval(random.choice(IP_list))

# Pool of User-Agent header values. Res_url and Save_img pick one at random
# (random.choice) for every request they send.
user_agent_list = [
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
        'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
        'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
        'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
        'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
        'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10',
        'Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13',
        'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+',
        'Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0',
        'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)',
        'UCWEB7.0.2.37/28/999',
        'NOKIA5700/ UCWEB7.0.2.37/28/999',
        'Openwave/ UCWEB7.0.2.37/28/999',
        'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',
        'Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25',
    ]

def Res_url(url):
    """Fetch ``url`` through a randomly chosen proxy and return the page text.

    Every call picks a random User-Agent from ``user_agent_list`` and a random
    proxy entry from ``IP_list``. Redirects are not followed and TLS
    certificates are not verified.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body, or the sentinel string ``'over'`` when the
        proxy entry cannot be parsed, the request fails, or the server answers
        with an HTTP error status. The failing URL is printed in that case.
    """
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    # "Connection: close" asks the server not to keep the connection alive.
    headers = {'User-Agent': random.choice(user_agent_list),
               'Referer': 'https://m.7160.top/rentiyishu/',
               "Connection": "close"}

    try:
        # SECURITY: eval() executes whatever text the proxy file contains.
        # Only use a proxy file you fully trust (ast.literal_eval would be the
        # safe choice if every entry is a plain dict literal).
        proxies = eval(random.choice(IP_list))
        # The context manager releases the session's connections on exit.
        with requests.session() as s:
            r = s.get(url, headers=headers, proxies=proxies, timeout=(5, 10),
                      verify=False, allow_redirects=False)
            r.raise_for_status()
            # Let requests guess the charset from the body itself.
            r.encoding = r.apparent_encoding
            return r.text
    except Exception:
        # Deliberately best-effort: any failure ends the caller's paging loop.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        print(url)
        return 'over'

'''
爬取首页中所有伪图片的链接地址和名称,返回成列表
'''
def Parse_page(html):
    """Collect the gallery links and titles listed on one index page.

    Args:
        html: Markup of a listing page.

    Returns:
        A pair ``(links, titles)`` of equally long lists, one element per
        ``.listUll2 li`` entry: the ``href`` of the entry's ``div a`` element
        and the text of its ``p`` element.
    """
    entries = list(pq(html)('.listUll2 li').items())
    links = [entry('div a').attr('href') for entry in entries]
    titles = [entry('p').text() for entry in entries]
    return links, titles


def Get_img(page_url, file_name):
    """Download the images of one gallery, page by page.

    The first page is ``page_url`` itself; page ``n`` (n >= 2) is addressed by
    inserting ``_n`` in front of the file extension, e.g. ``xxx.html`` ->
    ``xxx_2.html``. Pages are fetched in order until ``Res_url`` returns
    ``'over'``.

    Args:
        page_url: Address of the gallery's first page. A missing address
            (``None`` or empty) makes the function return immediately.
        file_name: Gallery title, passed on to ``Save_img`` as the folder name.
    """
    if not page_url:
        # The listing entry had no link; there is nothing to crawl.
        return

    # Split on the last dot only, so the rule works for any number of dots in
    # the URL (the fixed 4-piece split raised IndexError for e.g. relative
    # links).
    root, ext = os.path.splitext(page_url)

    num = 1
    while True:
        if num == 1:
            url = page_url
        else:
            url = root + '_' + str(num) + ext
        time.sleep(1)  # throttle requests to the site
        html = Res_url(url)
        if html == 'over':
            break

        img = pq(html)('.ArticleBox img')
        img_url = img.attr('src')
        if img_url is not None:
            alt = img.attr('alt')
            # '/' would be read as a directory separator in the file name.
            # Without an alt text fall back to the page number, so images do
            # not all end up in (and overwrite) the same "None" file.
            img_name = alt.replace('/', '') if alt else str(num)
            Save_img(img_url, file_name, img_name)
        num += 1


def Save_img(img_url, file_name, img_name):
    """Download one image and store it under ``./images_3/<file_name>/``.

    The file is written as ``<img_name>.<ext>``, where ``<ext>`` is the text
    after the last dot of ``img_url``. The request goes through a random proxy
    from ``IP_list`` with a random User-Agent, without certificate
    verification.

    Args:
        img_url: Address of the image.
        file_name: Name of the target folder below ``./images_3/``.
        img_name: File name without extension.

    Returns:
        None. A network error or a non-200 answer is printed, not raised.
    """
    time.sleep(1)  # throttle requests to the site
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    headers = {'User-Agent': random.choice(user_agent_list),
               'Referer': 'https://m.7160.top/rentiyishu/',
               "Connection": "close"}
    houzui = img_url.split('.')[-1]  # file extension ("houzui" = suffix)
    addr = './images_3/' + file_name
    # exist_ok avoids the check-then-create race between parallel workers.
    os.makedirs(addr, exist_ok=True)

    # SECURITY: eval() executes whatever text the proxy file contains; only
    # use a proxy file you fully trust.
    proxies = eval(random.choice(IP_list))
    try:
        # Keep the Response object: status_code and content are both needed
        # below (binding ``.content`` here made every call fail with
        # AttributeError). The timeout keeps a dead proxy from hanging forever.
        r = requests.get(img_url, headers=headers, proxies=proxies,
                         timeout=(5, 10), verify=False)
    except requests.RequestException as exc:
        # Best-effort: one failed image must not abort the whole gallery.
        print(img_url, exc)
        return

    if r.status_code == 200:
        with open('./images_3/{}/{}.{}'.format(file_name, img_name, houzui), 'wb') as f:
            f.write(r.content)
    else:
        print('图片地址不存在')


def main():
    """Walk the listing pages in order and download every gallery on them.

    Page 1 is ``index.html``; page ``n`` (n >= 2) is ``index_n.html``. The
    galleries of one listing page are downloaded concurrently by up to five
    joblib workers.

    Returns:
        The string ``'程序结束'`` once a listing page cannot be fetched or
        lists no galleries.
    """
    page_num = 1
    base_url = 'https://m.7160.top/rentiyishu/index'
    while True:
        if page_num == 1:
            url = base_url + '.html'
        else:
            url = base_url + '_' + str(page_num) + '.html'

        html = Res_url(url)
        if html != 'over':
            page_url_list, file_name_list = Parse_page(html)
        else:
            page_url_list, file_name_list = [], []

        # Stop on a failed fetch, and also on a page without entries:
        # otherwise a server that answers every page number with an empty
        # listing would keep this loop running forever.
        if not page_url_list:
            print('程序结束')
            return '程序结束'

        print('保存第', page_num, '页')
        # n_jobs=5 -> at most five galleries are downloaded at the same time.
        # The work is network-bound, so worker threads are requested; joblib's
        # default backend would start separate processes instead.
        Parallel(n_jobs=5, prefer='threads')(
            delayed(Get_img)(page_url, file_name)
            for page_url, file_name in zip(page_url_list, file_name_list)
        )
        page_num += 1


# Run the crawl only when executed as a script, not when the module is
# imported (e.g. by a worker process that re-imports the main module).
if __name__ == '__main__':
    main()

已有 15 条评论

  1. 不错不错,我喜欢看 www.jiwenlaw.com

  2. 兄弟写的非常好 https://www.cscnn.com/

  3. 《济公活佛1989》国产剧高清在线免费观看:https://www.jgz518.com/xingkong/127744.html

  4. 真好呢

  5. 哈哈哈,写的太好了https://www.lawjida.com/

  6. 哈哈哈,写的太好了https://www.lawjida.com/

  7. 作者以简洁明了的语言,传达了深刻的思想和情感。

  8. 建议引入反面案例,增强辩证性。

  9. 新盘新盘 这个月刚上新盘 新车第一个吃螃蟹!

  10. 2025年10月新盘 做第一批吃螃蟹的人coinsrore.com
    新车新盘 嘎嘎稳 嘎嘎靠谱coinsrore.com
    新车首发,新的一年,只带想赚米的人coinsrore.com
    新盘 上车集合 留下 我要发发 立马进裙coinsrore.com
    做了几十年的项目 我总结了最好的一个盘(纯干货)coinsrore.com
    新车上路,只带前10个人coinsrore.com
    新盘首开 新盘首开 征召客户!!!coinsrore.com
    新项目准备上线,寻找志同道合 的合作伙伴coinsrore.com
    新车即将上线 真正的项目,期待你的参与coinsrore.com
    新盘新项目,不再等待,现在就是最佳上车机会!coinsrore.com
    新盘新盘 这个月刚上新盘 新车第一个吃螃蟹!coinsrore.com

  11. 果博东方客服开户联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方公司客服电话联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方开户流程【182-8836-2750—】?薇- cxs20250806】
    果博东方客服怎么联系【182-8836-2750—】?薇- cxs20250806】

  12. 果博东方客服开户联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方公司客服电话联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方开户流程【182-8836-2750—】?薇- cxs20250806】
    果博东方客服怎么联系【182-8836-2750—】?薇- cxs20250806】

  13. 果博东方客服开户联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方公司客服电话联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方开户流程【182-8836-2750—】?薇- cxs20250806】
    果博东方客服怎么联系【182-8836-2750—】?薇- cxs20250806】

  14. 果博东方客服开户联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方公司客服电话联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方开户流程【182-8836-2750—】?薇- cxs20250806】
    果博东方客服怎么联系【182-8836-2750—】?薇- cxs20250806】

  15. 果博东方客服开户联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方公司客服电话联系方式【182-8836-2750—】?薇- cxs20250806】
    果博东方开户流程【182-8836-2750—】?薇- cxs20250806】
    果博东方客服怎么联系【182-8836-2750—】?薇- cxs20250806】

添加新评论