妹子网多进程爬虫

Zss 发表于:
PS:使用的版本是 Python 2.7。运行后先指定(或创建)一个用来保存图片的文件夹,再输入进程数量进行并发下载。
可以选择两种下载方式:选择 1 按指定页码范围下载,选择 2 按标签下载。当初不熟悉 class,
所以没有用类来写,全是一堆 def def def。

exe文件:妹子网

-----------------------------------------------------------------

#coding:gbk#
import requests
import os
from lxml import etree
import sys
import time
from multiprocessing import Pool
reload(sys)
sys.setdefaultencoding('gbk')

def new_path():
    """Prompt for a download directory until a usable one is given; return it.

    A path is accepted only when it starts with one of the drive prefixes
    ``c:/`` .. ``h:/`` (either case).  An accepted path that is already a
    directory is used as-is; one that does not exist yet is created with
    ``os.makedirs``.  Everything else -- a wrong prefix, a path that exists
    but is not a directory, or a directory that cannot be created -- shows
    the "invalid path" banner and asks again.

    Returns:
        The path string exactly as the user typed it.
    """
    drive_prefixes = tuple('{}:/'.format(letter) for letter in 'cdefghCDEFGH')
    rule = '   --------------------------------------------------'
    while True:
        print('  -------------------------------------------------------- ')
        print('|              下载小姐姐图片需要指定目录               |')
        print('|   请输入一个目录,有则使用此目录,无则创建 O(∩_∩)O  |')
        print('|    例如-> C:/Zhu/One      C盘Zhu文件夹下One文件夹     |')
        print('|                                                       |')
        print('|                                   Author:One_One      |')
        print('  --------------------------------------------------------\n')
        path = raw_input()

        # status_line stays None unless the path turns out to be usable.
        status_line = None
        if path.startswith(drive_prefixes):
            if os.path.isdir(path):
                status_line = '  |                   这个目录已存在                |'
            elif not os.path.exists(path):
                try:
                    os.makedirs(path)
                    status_line = '  |                 小姐姐目录创建成功              |'
                except OSError:
                    # The directory could not be created; fall through to
                    # the "invalid path" banner and ask again.
                    status_line = None

        if status_line is not None:
            print(rule)
            print(status_line)
            print('  |              小姐姐将保存在 {} 目录下        |'.format(str(path)))
            print(rule)
            return path

        print(rule)
        print('  |               请输入正确格式的路径!             |')
        print('  |                   ‘(*>﹏<*)′ ~               |')
        print('   --------------------------------------------------\n\n')

def start():
    """Show the download-mode menu and return the user's choice.

    Keeps prompting until the user types exactly ``1`` (download a page
    range) or ``2`` (download by tag name).

    Returns:
        The string ``'1'`` or ``'2'``.
    """
    divider = '   --------------------------------------------------'
    valid_choices = ('1', '2')

    print(divider)
    print('  |                1:根据指定页码范围来下载         |')
    print('  |                2:根据标签名来下载               |')
    print(divider)
    while True:
        print('  |                  请输入你的选择                 |')
        print('   --------------------------------------------------\n')
        answer = str(raw_input())
        if answer in valid_choices:
            return answer
        # Anything else: complain and show the prompt again.
        print(divider)
        print('  |                  需要输入正确的选择!!!       |')
        print('   --------------------------------------------------\n\n')

def start_choose(url):
    """Show the site's page count, ask for a page range, return the page numbers.

    The page at *url* is fetched and the ``href`` of the last link inside the
    ``class="page"`` pagination element is used to display the total number
    of pages.  The user then types a range such as ``4-67``; the prompt is
    repeated until the input is two integers ``first-last`` with
    ``1 <= first <= last``.

    Args:
        url: Address of the listing page whose pagination is inspected.

    Returns:
        A list of consecutive ints covering the requested range with both
        ends included (``"4-6"`` -> ``[4, 5, 6]``).

    Raises:
        IndexError: if the pagination link is not found in the fetched page.
    """
    rsp = requests.get(url).content
    xml = etree.HTML(rsp)
    last_page_href = xml.xpath('//*[@class="page"]/div/a[last()]/@href')[0]
    # Assumes the href ends with the page number (e.g. ".../home/93") --
    # TODO confirm.  Take the whole last path segment rather than a fixed
    # two characters, so 1- and 3-digit totals are shown correctly too.
    total_pages = last_page_href.rstrip('/').rsplit('/', 1)[-1]
    print('   --------------------------------------------------')
    print('  |  获取到总共有{}页小姐姐,每页小姐姐包含15个图册 |'.format(str(total_pages)))
    print('  |              请输入你想下载的页码范围           |')
    print('  |                例如:1-1 1-23 4-67               |')
    print('   --------------------------------------------------\n\n')
    while True:
        number = raw_input()
        try:
            first, last = [int(part) for part in number.split('-')]
        except ValueError:
            # Not exactly two integers separated by '-'.
            first, last = 0, -1
        if 1 <= first <= last:
            # range() excludes its stop value, so +1 makes the end inclusive.
            return list(range(first, last + 1))
        print('  |              请输入你想下载的页码范围           |')
        print('  |                例如:1-1 1-23 4-67               |')

def set_page_url(page_id_list):
    """Build the listing-page URL for every page number given.

    Args:
        page_id_list: Iterable of page numbers (anything ``str()`` can render).

    Returns:
        A list of ``http://www.mmjpg.com/home/<n>`` URLs in input order.
    """
    url_template = 'http://www.mmjpg.com/home/{}'
    return [url_template.format(str(page_id)) for page_id in page_id_list]

def set_atlas_points_url(points_url):
    """Fetch a listing page and pair each album title with its album URL.

    Args:
        points_url: URL of the listing page to download.

    Returns:
        The result of zipping the ``alt`` values of ``//ul/li/a/img`` with
        the ``href`` values of ``//ul/li/a``, i.e. ``(title, url)`` pairs
        matched purely by position.
    """
    document = etree.HTML(requests.get(points_url).content)
    titles = document.xpath('//ul/li/a/img/@alt')
    links = document.xpath('//ul/li/a/@href')
    return zip(titles, links)

def set_path_url(title_url,path,t):
    """Download every picture of one album into its own sub-folder of *path*.

    The album's first page is fetched to read the picture count, then each
    picture page ``<album_url>/<n>`` is fetched, the image address is taken
    from it, and the image is saved as ``<path>/<title>/<n>.jpg``.

    Args:
        title_url: ``(title, album_url)`` pair; ``title`` names the
            sub-folder and ``album_url`` is the album's first page.
        path: Base directory that receives the album folder.
        t: Sequence number supplied by the caller.  Kept for interface
            compatibility; it is not used.

    Each picture is handled in its own ``try`` block, so a failed picture
    only prints a notice and the loop moves on to the next one.  Errors
    before that loop (fetching the album page, or the picture counter not
    being found) propagate to the caller.
    """
    page_headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'max-age=0',
        'Connection':'keep-alive',
        'Host':'www.mmjpg.com',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    image_headers = {
        'Accept':'image/webp,image/apng,image/*,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Connection':'keep-alive',
        'Host':'img.mmjpg.com',
        'Referer':'http://www.mmjpg.com',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    album_title, album_url = title_url[0], title_url[1]

    # Folder named after the (Chinese) album title.  NOTE(review): the
    # str()/.encode('gbk') combination presumably relies on the process
    # default encoding having been switched to gbk -- confirm before porting.
    album_dir = '{}/{}'.format(str(path),str(album_title).encode('gbk'))
    if not os.path.exists(album_dir):
        os.makedirs(album_dir)
        print('\n')
        print('创建对应的小姐姐文件夹成功')
        print('等待下载中...请稍后...')
        time.sleep(0.5)
        print('\n')

    rsp = requests.get(album_url).content
    xml = etree.HTML(rsp)
    # The 7th link of the pager is read as the number of pictures.
    total_number = xml.xpath('//*[@id="page"]/a[7]/text()')[0]

    # A dedicated loop variable: the original reused the name of parameter t.
    for index in range(1,int(total_number)+1):
        try:
            page_url = '{}/{}'.format(str(album_url),str(index))
            image_path = '{}/{}.jpg'.format(str(album_dir),str(index))
            page_rsp = requests.get(page_url,headers=page_headers,timeout=0.5).content
            image_src = etree.HTML(page_rsp).xpath('//*[@id="content"]/a/img/@src')[0]
            time.sleep(0.15)
            image_rsp = requests.get(image_src,headers=image_headers,timeout=0.5)
            # Do not save an HTTP error page under a .jpg name.
            image_rsp.raise_for_status()
            print('-->  -->  -->')
            print('{}. 正在下载标题为:{}中的第{}/{}张   p( ^ O ^ )q... \n'.format(str(index),str(album_title),str(index),str(total_number)))
            with open(image_path,'wb') as image_file:
                image_file.write(image_rsp.content)
        except Exception:
            # Best effort per picture; unlike a bare except this still lets
            # KeyboardInterrupt / SystemExit through.
            print('\n')
            print('----------- 小姐姐下载失败了一张!!!+﹏+ ---------')
            print('\n')
            time.sleep(0.5)
#选项2
def get_title_url():
    """Fetch the tag index page and return every tag as ``(number, title, url)``.

    Titles come from the text of ``//ul/li/a`` and URLs from its ``href``;
    the two lists are paired by position and numbered starting at 1.

    Returns:
        A list of ``(number, title, url)`` tuples covering every paired tag.
    """
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Cache-Control':'max-age=0',
        'Connection':'keep-alive',
        'Host':'www.mmjpg.com',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119 Safari/537.36'
    }
    url = 'http://www.mmjpg.com/more/'
    rsp = requests.get(url=url,headers=headers).content
    xml = etree.HTML(rsp)
    tag_urls = xml.xpath('//ul/li/a/@href')
    tag_titles = xml.xpath('//ul/li/a/text()')
    # enumerate(..., 1) numbers every pair; the former range(1, len(...))
    # was one element short, so zip() silently dropped the last tag.
    # Example entry: (1, u'\u5218\u98de\u513f', 'http://www.mmjpg.com/tag/liufeier')
    return [
        (tag_no, tag_title, tag_url)
        for tag_no, (tag_title, tag_url) in enumerate(zip(tag_titles, tag_urls), 1)
    ]

def set_title_url(number_title_value,title_number):
    """Pick the URLs of the tags whose number was selected.

    Args:
        number_title_value: Sequence of ``(number, title, url)`` entries.
        title_number: Collection of the selected numbers, each as a string.

    Returns:
        URLs of the selected entries, in the order they occur in
        *number_title_value* (not the order of selection).
    """
    return [
        entry[2]
        for entry in number_title_value
        if str(entry[0]) in title_number
    ]

if __name__ == '__main__':
    # Script entry point: ask for the target folder, the number of worker
    # processes and the download mode, then hand one task per album to a
    # process pool.
    path = new_path()

    # Keep asking until a positive integer is typed; Pool() rejects
    # anything else.
    while True:
        try:
            worker_count = int(raw_input('请输入进程数\n'))
        except ValueError:
            continue
        if worker_count > 0:
            break

    choice = start()
    listing_urls = []  # listing pages whose albums will be downloaded
    if choice == '1':
        # Mode 1: download a range of listing pages.
        page_id_list = start_choose(url='http://www.mmjpg.com/')
        listing_urls = set_page_url(page_id_list)
    elif choice == '2':
        # Mode 2: download the albums of the selected tags.
        number_title_value = list(get_title_url())
        tag_total = len(number_title_value)
        print('   --------------------------------------------------')
        print('            已搜索出{}个标签,请选择标签号码       '.format(str(tag_total)))
        # Show the tags five per row.
        show_list = []
        for entry in number_title_value:
            show_list.append('{}:{}'.format(str(entry[0]),str(entry[1])))
            if len(show_list) == 5:
                print('    {}  {}  {}  {}   {}    '.format(*show_list))
                show_list = []
        if show_list:
            # Last, incomplete row (previously never displayed).
            print('    {}    '.format('  '.join(show_list)))
        print('            已搜索出{}个标签,请选择标签号码,       '.format(str(tag_total)))
        print('      选择方法:标题号加回车   选择完毕后输入0退出选择 ')
        print('   --------------------------------------------------')
        title_number = []
        while True:
            # raw_input + int instead of Python 2 input(), which evaluates
            # whatever the user types as an expression.
            try:
                y = int(raw_input().strip())
            except ValueError:
                continue
            if y == 0:
                break
            if 0 < y <= tag_total:
                title_number.append(str(y))
        # Resolved once after the loop, so it is defined even when the user
        # enters 0 straight away.
        listing_urls = set_title_url(number_title_value,title_number)
    else:
        print(' --------------------------------------------------')
        print('|              请输入正确的选择                  | ')
        print(' --------------------------------------------------\n')

    if listing_urls:
        pool = Pool(worker_count)
        t = 0  # running album counter passed on to each task
        for listing_url in listing_urls:
            for album in set_atlas_points_url(listing_url):
                t += 1
                pool.apply_async(set_path_url,(album,path,t))
        pool.close()
        pool.join()