Python抓取聚划算商品分析页面获取商品信息并以XML格式保存到本地-侯体宗的博客

Python抓取聚划算商品分析页面获取商品信息并以XML格式保存到本地
Python / 管理员发布于 7年前 196
本文实例为大家分享了Python抓取聚划算商品页面获取商品信息并保存的具体代码，供大家参考，具体内容如下：
#!/user/bin/python # -*- coding: gbk -*- #Spider.py  import urllib2 import httplib import StringIO import gzip import re import chardet import sys import os import datetime from xml.dom.minidom import Document from BeautifulSoup import BeautifulSoup  ## 这段代码是用于解决控制台打印汉字报错的问题 reload(sys) sys.setdefaultencoding("utf8") #####################################################  ## debug模式开关，开启后可以看到Http请求的头部信息以及debug日志 DEBUG = 1 NO_DEBUG = 0 httplib.HTTPConnection.debuglevel = DEBUG ## 是否显示爬取网页源代码开关 showSrcCode = False ## 压缩方式 ZIP_TYPE = "gzip"  fileName = "auctions" location = "d://spiderData/"  ## header headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE} #####################################################   #############class SpiderConfig ##################### class SpiderConfig:  """   configuration for spider name and url  """  def __init__(self, name, url):   self.name = name   self.url = url #####################################################  ##############class SpiderAuctionDomain############## class SpiderAuctionDomain:  """   Store information with auctions spidered by python  """  title = ""  url = ""  img = ""  price = ""   def __init__(self):   pass  #####################################################  ########class SpiderDefaultErrorHandler############## class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):  def http_error_default(self, req, fp, code, msg, hdrs):   """    default error process handler for spider   """   result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)   result.status = code   result.url = req.get_full_url()    print "<", result.url, "Exception code :", result.status, ">"    return result #####################################################  #############class SpiderHandler##################### class SpiderHandler:  """   spider handler  """   def spider(self, spiderConfig):   try:    request = urllib2.Request(spiderConfig.url)     ## configure request hreader    for key,val in 
headerConfig.items():     request.add_header(key, val)     ## build opener    opener = urllib2.build_opener(SpiderDefaultErrorHandler())     ## open request    openRequest = opener.open(request)     ## read data    spiderData = openRequest.read()     ## close    opener.close()     if 0 == len(spiderData):     return     if ZIP_TYPE== openRequest.headers.get("Content-Encoding"):     spiderData = SpiderHandler.gzipData(self, spiderData)     if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:     print spiderData     # parse html    SpiderHandler.parse(self, spiderData)    except Exception,x:    print "spider process Exception:", x     def parse(self, spiderData):   """    parse html content   """    if httplib.HTTPConnection.debuglevel == DEBUG:    charsetAnalyze = chardet.detect(spiderData)    print "analyze spider data encode :",charsetAnalyze["encoding"]    print "执行解析", fileName    soup = BeautifulSoup(spiderData)   encode = soup.originalEncoding    encoding = lambda x : x.encode(encode)    if httplib.HTTPConnection.debuglevel == DEBUG:    print "识别到编码：", encode    title = soup.head.title.string    print encoding(title)    spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"})   auctions = ["%s" % s for s in spiderContents]    if auctions is None:    return    auctionList = []    for auc in auctions:    auctionDomain = SpiderAuctionDomain()    # parse auction link    links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc)    if links is not None :     auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))     #parse auction title    titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc)    if titles is not None:     auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))     #parse auction price    price = 
re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc)    if price is not None:     auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])     #parse image url    imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)    if imgs is not None:     auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])     auctionList.append(auctionDomain)    print "成功解析商品信息："   for a in auctionList:    print "--->",a.title    # sort auction list   auctionList = SpiderHandler.sortAuctionList(self, auctionList)    # save in file   SpiderHandler.save(self, auctionList)    print "解析完成"    pass   def sortAuctionList(self, auctionList):   """    冒泡排序，按照价格排序   """   length = len(auctionList)   if length < 2:    return auctionList   else:    for i in range(length-1):     for j in range(length - i -1):      if float(auctionList[j].price) > float(auctionList[j+1].price):       auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j]   return auctionList   pass   def save(self, auctionList):   if auctionList is not None:    doc = Document()     auctions = doc.createElement("auctions")    doc.appendChild(auctions)     for auc in auctionList:     auction = doc.createElement("auction")     auctions.appendChild(auction)      SpiderHandler.generateXML(self, doc, auction, "title", auc.title)     SpiderHandler.generateXML(self, doc, auction, "price", auc.price)     SpiderHandler.generateXML(self, doc, auction, "img", auc.img)     SpiderHandler.generateXML(self, doc, auction, "link", auc.link)     if False == os.path.exists(location):     os.mkdir(location)     file = open(location+fileName+".xml", 'w')    file.write(doc.toprettyxml())    file.close()     if httplib.HTTPConnection.debuglevel == DEBUG:     print doc.toprettyxml()   def generateXML(self, doc, f, name, txt):   c = doc.createElement(name)   f.appendChild(c)   c.appendChild(doc.createTextNode(txt))   def 
gzipData(self, spiderData):   """    get data from gzip   """   if 0 == len(spiderData):    return spiderData   spiderDataStream = StringIO.StringIO(spiderData)   spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()   return spiderData #####################################################  if __name__ == "__main__":  nowtime = lambda:datetime.datetime.strftime(datetime.datetime.now(),"%Y年%m月%d日 %H时%m分%S秒")   needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou",       "hangzhou":"http://ju.taobao.com/hangzhou",       "shanghai":"http://ju.taobao.com/shanghai",       "beijing":"http://ju.taobao.com/beijing",       "chengdu":"http://ju.taobao.com/chengdu"}   configList = []  for k,v in needSpiderUrl.items():   spiderConfig = SpiderConfig(k, v)   configList.append(spiderConfig)   spiderHandler = SpiderHandler()   print "爬虫执行开始时间：",nowtime()  for spiderConfig in configList:   fileName = spiderConfig.name   spiderHandler.spider(spiderConfig)   print "爬虫执行完毕时间：",nowtime()
更多内容请参考专题《python爬取功能汇总》进行学习。
以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持。
上一条：
Python爬豆瓣电影实例
下一条：
Python各类图像库的图片读写方式总结(推荐)
0条评论 (评论内容有缓存机制,请悉知!)