Scraping proxy IPs

Posted by Zss:

This script scrapes a single page of the list (page 3), keeps only the IPs with fast connection speed and low latency, and then tests each one by connecting to Baidu to check that it actually works.

Many of the values could be parameterized, and the results could be written to a dedicated file to form an IP proxy pool: when an IP stops working, delete it and fetch a fresh one to replace it (a sketch of this follows the main script).

# coding: utf-8
import requests
from lxml import etree

def get_ip():
    # Page 3 of the xicidaili high-anonymity proxy list
    url = 'http://www.xicidaili.com/nn/3'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'}
    ip_good_list = []
    rsp = requests.get(url, headers=headers)
    xml = etree.HTML(rsp.content)
    # Only rows with class="odd" are matched, i.e. every other row of the table
    ip = xml.xpath('//tr[@class="odd"]/td[2]/text()')
    ip_port = xml.xpath('//tr[@class="odd"]/td[3]/text()')
    fast = xml.xpath('//tr[@class="odd"]/td[7]/div/@title')
    ip_list = zip(ip, ip_port, fast)
    for i in ip_list:
        # The speed column's title looks like "0.523秒"; strip the trailing "秒" (seconds)
        number = float(i[2].strip('秒'))
        if number <= 1:  # keep only proxies that responded within one second
            ip_proxies = 'http://{}:{}'.format(i[0], i[1])
            print(ip_proxies)
            ip_good_list.append(ip_proxies)
    return ip_good_list

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'}
ip_list = get_ip()
print(len(ip_list))
for i in ip_list:
    try:
        # The key must be 'http' (not 'http:'), otherwise requests silently ignores the proxy
        proxies = {'http': i}
        print(proxies)
        rsp = requests.get('http://www.baidu.com', proxies=proxies, headers=headers, timeout=4)
        print(rsp.status_code)
    except requests.exceptions.RequestException:
        print('failed')
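
For the proxy-pool idea mentioned above, here is a minimal sketch, assuming the pool lives in a local file. The file name proxy_pool.txt and the helpers load_pool, save_pool, and refresh_pool are hypothetical names chosen for illustration, not part of the original script; the sketch reuses get_ip() defined above.

import requests

POOL_FILE = 'proxy_pool.txt'  # hypothetical file used to persist the pool

def load_pool():
    # One proxy URL per line; a missing file just means an empty pool
    try:
        with open(POOL_FILE) as f:
            return [line.strip() for line in f if line.strip()]
    except IOError:
        return []

def save_pool(pool):
    with open(POOL_FILE, 'w') as f:
        f.write('\n'.join(pool))

def refresh_pool(pool):
    # Drop proxies that can no longer reach Baidu, then top the pool up
    # with fresh candidates from get_ip() above
    alive = []
    for proxy in pool:
        try:
            requests.get('http://www.baidu.com',
                         proxies={'http': proxy}, timeout=4)
            alive.append(proxy)
        except requests.exceptions.RequestException:
            pass  # dead proxy: not carried over
    for proxy in get_ip():
        if proxy not in alive:
            alive.append(proxy)
    save_pool(alive)
    return alive

Calling refresh_pool(load_pool()) periodically keeps the file stocked with working proxies, and any consumer can simply read one proxy per line from proxy_pool.txt.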