scrapy简单学习6—爬取百度贴吧图片(仿写向)

552 查看

主要是对上一篇文章的简单仿写,大家以后想批量下载什么图片照格式仿写就好。由于本人是tfboys的粉丝,所以平常没事爱逛贴吧欣赏我家三小只的美图,所以这次就以贴吧[小王的讨论楼]为例,批量爬取该楼的图片[1]

items.py编写

import scrapy


class WangyuantuItem(scrapy.Item):
    """Scraped item that carries the image URLs found on one page."""

    # Only the image URLs are needed for this project.
    image_urls = scrapy.Field()

spider的编写

import scrapy
import requests
import os
from wangyuantu.items import WangyuantuItem

class XiaowangSpider(scrapy.Spider):
    """Spider that collects image URLs from one Baidu Tieba thread.

    It requests the thread with ``pn`` values 21 through 44 and yields one
    ``WangyuantuItem`` per response.
    """

    name = "xiaowang"
    # allowed_domains takes bare domain names only; a URL path here
    # (the original had "tieba.baidu.com/p/3888309273") is not a valid entry.
    allowed_domains = ["tieba.baidu.com"]
    start_urls = [
        'http://tieba.baidu.com/p/3888309273?pn=%d' % i for i in range(21, 45)
    ]

    def parse(self, response):
        """Yield a single item holding every matching image URL on the page.

        The URLs are the ``src`` attributes of all ``<img>`` elements whose
        ``class`` attribute is exactly ``BDE_Image``.
        """
        item = WangyuantuItem()
        item['image_urls'] = response.xpath(
            "//img[@class='BDE_Image']/@src").extract()

        yield item

pipelines编写:这个部分都是可以套用的

import requests
from wangyuantu import settings
import os

#图片下载类
class ImageDownloadPipeline(object):
    """Item pipeline that saves every URL in ``item['image_urls']`` to disk.

    Files are stored under ``<settings.IMAGES_STORE>/<spider.name>/``.
    """

    def process_item(self, item, spider):
        """Download the item's images, then hand the item on.

        Args:
            item: the scraped item; only its ``image_urls`` entry is read.
            spider: the spider that produced the item; its ``name`` is used
                as the sub-directory name.

        Returns:
            The same ``item``, so that later pipeline stages still receive it.
        """
        # Nothing to download: pass the item through untouched.
        if 'image_urls' not in item:
            return item

        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        for image_url in item['image_urls']:
            # Build the file name from the URL segments after the host part,
            # joined with underscores.
            image_file_name = '_'.join(image_url.split('/')[3:])
            file_path = '%s/%s' % (dir_path, image_file_name)

            # Already saved on an earlier run: do not download again.
            if os.path.exists(file_path):
                continue

            # Issue the request before creating the file, so a failed request
            # does not leave an empty file that would be skipped next time.
            response = requests.get(image_url, stream=True)
            with open(file_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)

        return item

settings编写

# Scrapy settings for the 'wangyuantu' project.
BOT_NAME = 'wangyuantu'

SPIDER_MODULES = ['wangyuantu.spiders']
NEWSPIDER_MODULE = 'wangyuantu.spiders'

# Enable the custom image download pipeline.
ITEM_PIPELINES = {'wangyuantu.pipelines.ImageDownloadPipeline': 1}
# Root directory for downloaded images. This must be a raw string: in a
# normal literal on Python 3, the "\U" in "C:\Users" starts a unicode
# escape and the file fails to compile.
IMAGES_STORE = r'C:\Users\Lenovo\Pictures'

结果

寄语:wili源源小可爱,希望你快快乐乐的长大


2018 - 知识虫 - 我的知识库 渝ICP备16002641号-2

渝公网安备 50010702501581号