摘要:汽車之家車型的簡單爬取。先自定義 spider 的名字與配置,並重新定義起始爬取點:按照所有首字母組合出對應的頁面,壓入 start_urls,再根據 start_urls 抓取頁面。在默認的抓取函數中,依次解析品牌編號、品牌名、品牌 logo、品牌小類別、品牌小類別對應的頁面、品牌小類別的編號、品牌小類別名,以及品牌小類別對應的頁面的 url。
汽車之家車型的簡單爬取
spider
# -*- coding: utf-8 -*-
"""Scrapy spider that collects car models from the autohome.com.cn brand index."""
import string
import sys

import scrapy
from scrapy import Request

from mininova.items import carItem

# Python 2 only: force UTF-8 as the implicit str/unicode codec so the
# non-ASCII literals below can be mixed with the unicode text returned by the
# selectors. `reload` is not a builtin on Python 3, hence the guard.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding("utf8")

# One index page exists per initial letter of the brand name.
_INDEX_URL = "https://www.autohome.com.cn/grade/carhtml/{0}.html"
# Value stored for both prices when the page shows no "min-max" range.
_NO_PRICE = "暫無"
# Unit suffix removed from the upper price bound. The first entry is the
# original literal; the second is its simplified form.
# NOTE(review): the site presumably serves simplified Chinese -- confirm.
_PRICE_UNITS = ("萬", "万")


class SplashSpider(scrapy.Spider):
    """Crawl the A-Z brand index pages and emit one ``carItem`` per car model."""

    # Spider name used on the command line: ``scrapy crawl car_home``.
    name = "car_home"
    allowed_domains = ["autohome.com.cn"]
    # Left empty on purpose: the requests are generated in start_requests().
    start_urls = []
    # Per-spider settings: route items through the car pipeline only.
    custom_settings = {
        "ITEM_PIPELINES": {
            "mininova.pipelines.CarPipeline": 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial letter, tagged with that letter.

        The letter travels in ``meta["word"]`` so that parse() can store it on
        every item. Each request carries its own letter, and the class-level
        ``start_urls`` list is not mutated.
        """
        for word in string.ascii_uppercase:
            yield Request(_INDEX_URL.format(word), meta={"word": word})

    def parse(self, response):
        """Extract every car listed on one index page.

        The page is read as a sequence of ``<dl>`` elements, one per brand.
        Inside each brand the sub-brand titles (``div.h3-tit``) are paired by
        position with the car lists (``ul.rank-list-ul``).

        Brands or cars without a name node are skipped instead of aborting the
        whole page with an IndexError.

        :param response: the downloaded index page; ``response.meta["word"]``
            must hold the initial letter the page belongs to.
        :return: list of populated ``carItem`` objects.
        """
        word = response.meta["word"]
        self.logger.debug("parsing %s", response.url)

        total_cars = []
        for brand_node in response.xpath("//dl"):
            brand = brand_node.xpath("./dt/div[1]/a/text()").extract_first()
            if brand is None:
                continue
            brand_logo_url = brand_node.xpath(
                "./dt//img[1]/@src").extract_first(default="")

            titles = brand_node.xpath("./dd//div[@class='h3-tit']/a")
            car_lists = brand_node.xpath("./dd//ul[@class='rank-list-ul']")
            # zip() stops at the shorter sequence, so a title without a
            # matching list (or the reverse) simply contributes no cars.
            for title, car_list in zip(titles, car_lists):
                brand_item = title.xpath("./text()").extract_first(default="")
                for car in car_list.xpath("./li[@id]"):
                    car_item = self._build_item(
                        car, word, brand, brand_logo_url, brand_item)
                    if car_item is not None:
                        total_cars.append(car_item)
        return total_cars

    def _build_item(self, car, word, brand, brand_logo_url, brand_item):
        """Turn one ``<li>`` car node into a ``carItem``; None if it has no name."""
        name = car.xpath("./h4/a/text()").extract_first()
        if name is None:
            return None
        price = car.xpath("./div[1]/a/text()").extract_first(default="")
        min_price, max_price = self._split_price(price)

        car_item = carItem()
        car_item["name"] = name
        car_item["url"] = car.xpath("./h4/a/@href").extract_first(default="")
        car_item["brand_item"] = brand_item
        car_item["first_word"] = word
        car_item["brand"] = brand
        car_item["brand_logo_url"] = brand_logo_url
        car_item["max_price"] = max_price
        car_item["min_price"] = min_price
        return car_item

    @staticmethod
    def _split_price(price):
        """Split a "min-max<unit>" quote into ``(min_price, max_price)`` strings.

        Anything that is not exactly two dash-separated parts yields the
        "no price" placeholder for both values.
        """
        parts = price.split("-")
        if len(parts) != 2:
            return _NO_PRICE, _NO_PRICE
        low, high = parts
        for unit in _PRICE_UNITS:
            high = high.replace(unit, "")
        return str(low), str(high)
item
# -*- coding: utf-8 -*-
import scrapy


class carItem(scrapy.Item):
    """Container for one car model scraped from a brand index page."""

    # --- the car itself ---
    # Model name.
    name = scrapy.Field()
    # URL of the model's introduction page.
    url = scrapy.Field()

    # --- quoted price range (unit: 10,000 yuan) ---
    # Lowest quoted price.
    min_price = scrapy.Field()
    # Highest quoted price.
    max_price = scrapy.Field()

    # --- brand information ---
    # Initial letter the brand is indexed under.
    first_word = scrapy.Field()
    # Brand name.
    brand = scrapy.Field()
    # URL of the brand logo image.
    brand_logo_url = scrapy.Field()
    # Name of the sub-brand the car is listed under.
    brand_item = scrapy.Field()
mongo_car
from mininova.mongodb import Mongo
from mininova.settings import mongo_setting


class MongoCar():
    """Store scraped cars in MongoDB as brand / sub-brand / car documents.

    Lookup methods return the matching document, or ``False`` when nothing
    matches. Insert methods return the stored document re-read from the
    collection.
    """

    # Database name and the three collections it holds.
    db_name = "car"
    brand_set_name = "brand"
    brand_item_set_name = "brand_item"
    car_set_name = "car"

    def __init__(self):
        """Open the connection described by ``mongo_setting``."""
        self.db = Mongo(
            mongo_setting["mongo_host"],
            mongo_setting["mongo_port"],
            mongo_setting["mongo_user"],
            mongo_setting["mongo_password"],
        )

    def insert(self, item):
        """Upsert the brand, sub-brand and car carried by one scraped item.

        :param item: mapping with the keys ``brand``, ``first_word``,
            ``brand_logo_url``, ``brand_item``, ``name``, ``url``,
            ``max_price`` and ``min_price``.
        :return: ``True`` once the car exists in the collection, ``False``
            when the brand document could not be read back.
        """
        # The logo is written on insert as well as on update, so a brand seen
        # only once is not stored without its logo.
        brand_where = {"name": item["brand"]}
        brand_doc = {
            "name": item["brand"],
            "first_word": item["first_word"],
            "logo_url": item["brand_logo_url"],
        }
        if self.brand_exist(self.db, brand_where) is False:
            brand = self.insert_brand(self.db, brand_doc)
            print("brand insert ok!")
        else:
            brand = self.update_brand(self.db, brand_where, brand_doc)
            print("brand_exist!")
        if not brand:
            # The brand vanished between the write and the re-read; without
            # its _id the dependent documents cannot be linked.
            return False

        brand_item_where = {"name": item["brand_item"]}
        brand_item = self.brand_item_exist(self.db, brand_item_where)
        if brand_item is False:
            brand_item = self.insert_brand_item(self.db, {
                "name": item["brand_item"],
                "first_word": item["first_word"],
                "brand_id": brand["_id"],
            })
            print("brand_item insert ok!")
        else:
            print("brand_item_exist!")

        # The original filter repeated the "name" key, so the sub-brand value
        # was silently discarded. Scope the lookup by sub-brand id instead.
        # NOTE(review): assumes equal car names under different sub-brands
        # are distinct cars -- confirm.
        car_where = {
            "name": item["name"],
            "brand_item_id": brand_item["_id"],
        }
        car = self.car_exist(self.db, car_where)
        if car is False:
            car = self.insert_car(self.db, {
                "name": item["name"],
                "url": item["url"],
                "max_price": item["max_price"],
                "min_price": item["min_price"],
                "first_word": item["first_word"],
                "brand_id": brand["_id"],
                "brand_item_id": brand_item["_id"],
            })
            print("car insert ok!")
        else:
            print("car_exist!")
        return car is not False

    def _find(self, db, set_name, where):
        """Return the first document in ``set_name`` matching ``where``, else False."""
        found = db.set(self.db_name, set_name).find_one(where)
        return False if found is None else found

    def _insert(self, db, set_name, doc):
        """Insert ``doc`` into ``set_name`` and return it as read back."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """Apply ``brand`` as a ``$set`` to the matching brand; return it or False."""
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {"$set": brand})
        return self._find(db, self.brand_set_name, brand_where)

    def brand_exist(self, db, brand):
        """Return the brand matching the filter ``brand``, or False."""
        return self._find(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return the stored copy."""
        return self._insert(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the sub-brand matching the filter ``brand_item``, or False."""
        return self._find(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a sub-brand document and return the stored copy."""
        return self._insert(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car matching the filter ``car``, or False."""
        return self._find(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return the stored copy."""
        return self._insert(db, self.car_set_name, car)
pipeline
import copy
import os

import pymysql

from mininova.db import Bookdb
from mininova.mongo_car import MongoCar
from mininova.mongo_novel import MongoNovel
from mininova.settings import settings


class CarPipeline(object):
    """Item pipeline that persists every scraped car through ``MongoCar``."""

    # Created on first use and then reused, so one connection serves the
    # whole crawl instead of a new one being opened per item.
    _mongo_car = None

    def process_item(self, item, spider):
        """Store ``item`` and hand it on to the next pipeline stage.

        :param item: the scraped car item.
        :param spider: the spider that produced the item (unused).
        :return: the same ``item``. Scrapy passes this return value to the
            following pipeline components, so it must not be None.
        """
        if self._mongo_car is None:
            self._mongo_car = MongoCar()
        self._mongo_car.insert(item)
        print(item["name"])
        print("item insert ok!")
        return item
setting
# Connection parameters for the MongoDB server.
# The values are placeholders and must be replaced for a real deployment.
mongo_setting = dict(
    mongo_host="xxx.xxx.xxx.xxx",
    mongo_port=27017,
    mongo_user="username",
    mongo_password="password",
)
文章版權歸作者所有,未經允許請勿轉載;若此文章存在違規行為,您可以聯繫管理員刪除。
轉載請注明本文地址:http://specialneedsforspecialkids.com/yun/43814.html
摘要:時間永遠都過得那么快,一晃從年注冊,到現在已經過去了年那些被我藏在收藏夾吃灰的文章,已經太多了,是時候把他們整理一下了。那是因為收藏夾太亂,橡皮擦給設置私密了,不收拾不好看呀。 ...
摘要:都說年末了,該給自己寫寫總結了。我現在做一些簡單的爬蟲都會用它。并且對數據的實時性要求較高,或者爬數據的時候封的太厲害了。對于這一類的爬蟲。消息隊列用于分發消息給某個爬蟲節點。爬蟲節點完成具體的爬蟲,格式化爬蟲數據。最后,感謝我的,謝謝 都說年末了,該給自己寫寫總結了。今天我想談一談的是我在公司這一年多里的負責的部分工作---爬蟲。做了這么久的爬蟲,是該寫點什么,留下點什么。在我所負責...
摘要:原文地址爬取汽車之家二手車產品庫項目地址目標最近經常有人在耳邊提起汽車之家,也好奇二手車在國內的價格是怎么樣的,因此本次的目標站點是汽車之家的二手車產品庫分析目標源一頁共條含分頁,但這個老產品庫,在頁后會存在問題,因此我們爬取頁可以獲取全 原文地址:爬取汽車之家 二手車產品庫項目地址:https://github.com/go-crawler... 目標 最近經常有人在耳邊提起汽車之家...
摘要:不過不用擔心,中有很多非常優秀的爬蟲框架,比如我們接下來要學習到的。結合以上分析我們基本確定了本次爬蟲的各個路線入口,接下來我們就開始通過程序來實現本次的目標。這里我們的目的是建立一種寫爬蟲的思路,而不在于怎么使用工具來爬數據。 概述 在上一篇文章《爬蟲學習之一個簡單的網絡爬蟲》中我們對爬蟲的概念有了一個初步的認識,并且通過Python的一些第三方庫很方便的提取了我們想要的內容,但是...
閱讀 2722·2021-11-22 13:54
閱讀 1063·2021-10-14 09:48
閱讀 2292·2021-09-08 09:35
閱讀 1550·2019-08-30 15:53
閱讀 1166·2019-08-30 13:14
閱讀 606·2019-08-30 13:09
閱讀 2521·2019-08-30 10:57
閱讀 3334·2019-08-29 13:18