Python实现的文轩网爬虫完整示例-侯体宗的博客

Python实现的文轩网爬虫完整示例
Python / 管理员发布于 8年前 268

本文实例讲述了Python实现的文轩网爬虫。分享给大家供大家参考，具体如下：

# encoding=utf8
#
# Crawler for winxuan.com: walks the category links on the home page, opens
# every book detail page found there, scrapes the book's metadata and cover
# images, and inserts one row per book into the MySQL table `bi_book`.
#
# NOTE: this script targets Python 2 -- it relies on reload(sys) /
# sys.setdefaultencoding and on cgi.escape. The syntax below is kept valid for
# both Python 2 and Python 3 parsers, but the runtime behaviour is Python 2.
import pymysql
import time
import sys
import requests
import os
# Error reporting
import traceback
import types
# HTML entity escaping
import cgi
import warnings

# Make implicit str/unicode conversions use UTF-8 so the byte-string literals
# below can be compared with the unicode text returned by pyquery.
reload(sys)
sys.setdefaultencoding('utf-8')

from pyquery import PyQuery as pq
from lxml import etree

# Silence library warnings
warnings.filterwarnings("ignore")

# Pages whose <html xmlns> matches one of these values are skipped.
_SKIP_XMLNS = (
    'http://www.w3.org/1999/xhtml',
    'http://www.winxuan.com/cms/2016db_sh',
)

# (marker that identifies the ISBN row, label text stripped from the row)
_ISBN_LABELS = (
    ('I S B N', 'I S B N：'),
    ('isbn：', 'isbn：'),
)

# (label found in the detail list, key under which the value is stored)
_FIELD_LABELS = (
    ('作者：', 'author'),
    ('出版社：', 'press_name'),
    ('版次：', 'edition'),
    ('印次：', 'impressions'),
    ('装帧：', 'packaging'),
    ('开本：', 'size'),
    ('出版时间：', 'press_time'),
    ('印刷时间：', 'print_time'),
    ('页数：', 'page_num'),
    ('字数：', 'word_num'),
)

# Values used when a field is not present on the page.
_FIELD_DEFAULTS = {
    'author': '',
    'press_name': '',
    'edition': '',
    'impressions': '',
    'packaging': '',
    'size': '',
    'press_time': '1970-01-01',
    'print_time': '1970-01-01',
    'page_num': '',
    'word_num': '',
}

# Placeholder cover URL the site uses when a book has no picture.
_NONE_IMG = 'http://static.winxuancdn.com/goods/sml_blank.jpg'

_INSERT_SQL = (
    "insert into bi_book (book_id,source_type,category,details,detail_url,"
    "price,add_time,packaging,print_time,impressions,name,author,press_name,"
    "isbn,edition,size,press_time,page_num,word_num,img_big,img_small) "
    "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def dowloadPic(imageUrl, filePath):
    """Download `imageUrl` and write the response body to `filePath`.

    Returns 404 (and writes nothing) when the server answers 404; otherwise
    writes the body and returns None. Any other HTTP status is NOT treated as
    a failure, so its body is written as-is. Network errors raised by
    `requests` propagate to the caller.
    """
    response = requests.get(imageUrl, timeout=60)
    if response.status_code == 404:
        return 404
    with open(filePath, "wb") as image_file:
        image_file.write(response.content)


def _book_exists(cursor, isbn):
    """Return True when `bi_book` already holds a row with this ISBN."""
    # Parameterized: the ISBN is scraped text and must never be spliced into
    # the SQL string (it may contain 'X', spaces, or hostile input).
    cursor.execute('select count(*) from bi_book where isbn = %s', (isbn,))
    return cursor.fetchone()['count(*)'] != 0


def _section_html(section):
    """Return the UTF-8 HTML of the `.text-words-1` node following `section`.

    Returns '' when the node is missing, so an absent section becomes an
    empty string instead of raising on `None.encode`.
    """
    html = pq(section.nextAll())('.text-words-1').html()
    if html is None:
        return ''
    return html.encode("utf8", "ignore")


def _scrape_and_store(detail_url, conn, cursor):
    """Scrape one detail page and insert it; return 'back' when skipped."""
    # Load and parse the detail page
    try:
        c = pq(detail_url)
        head = c('html').attr('xmlns')
    except Exception:
        return 'back'
    if head in _SKIP_XMLNS:
        return 'back'

    # Walk the first 12 entries of the attribute list
    isbn = None
    info = dict(_FIELD_DEFAULTS)
    rows = c('#page').find('.cont').find('li')
    for i in range(12):
        # Drop ideographic spaces (U+3000) used as padding inside the labels
        text = (rows.eq(i).text() or '').replace(u'\u3000', '')
        for marker, label in _ISBN_LABELS:
            if marker in text:
                isbn = text.replace(label, '').strip()
                if not isbn:
                    return 'back'
                if _book_exists(cursor, isbn):
                    print(u'已存在')
                    return 'back'
        for label, key in _FIELD_LABELS:
            if label in text:
                info[key] = text.replace(label, '')

    # Without an ISBN there is no key for the row or the image directory
    if not isbn:
        return 'back'

    # Dates: trim, and map the site's "none" marker to the default date
    for key in ('press_time', 'print_time'):
        info[key] = info[key].strip()
        if info[key] == '无':
            info[key] = '1970-01-01'

    # Large / small cover URLs
    big_path = c('.info-side').find('.img').find('a').find('img').attr('src')
    if big_path is None:
        return 'back'
    if big_path == _NONE_IMG:
        big_path = ''
        small_path = ''
    else:
        small_path = big_path.replace('_16', '_11')

    # Category: text of the last <a> in the first breadcrumb block
    nav_html = c('#page').find('.base-nav').eq(0).html()
    category = pq(nav_html)('a:last').text()

    # Book title
    name = c('.info-main').find('.name').eq(0).find('h1').eq(0).text()
    name = name.strip()

    # Price
    price = (c('.info-main').find('.attr').eq(0)
             .find('.price-n').eq(0).find('b').text())
    price = price.replace('¥', '')

    # Synopsis and table of contents, looked up among title blocks 5..11
    content = ''
    directory = ''
    titles = c('#page').find('.title')
    for k in range(5, 12):
        section = titles.eq(k)
        title = section.find('.tab').find('h4').text() or ''
        if '内容简介' in title:
            content = _section_html(section)
        if '目录' in title:
            directory = _section_html(section)

    details = '内容简介<br>' + content + '<br><br>目录<br>' + directory
    details = cgi.escape(details)

    # Date the record was added
    add_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))

    # Images are stored under <script dir>/download/<isbn>/
    root_path = sys.path[0].replace('\\', '/')
    isbn_path = root_path + '/download/' + isbn
    stored_small = 'download/' + isbn + '/small' + isbn + '.jpg'
    stored_big = 'download/' + isbn + '/big' + isbn + '.jpg'

    img_big = 'none-picture/none-big.jpg'
    img_small = 'none-picture/none-small.jpg'
    if big_path != '' and small_path != '':
        if not os.path.isdir(isbn_path):
            # makedirs also creates the parent "download" directory on the
            # first run, where a plain mkdir would fail.
            os.makedirs(isbn_path)
            small_res = dowloadPic(small_path,
                                   isbn_path + "/small" + isbn + ".jpg")
            big_res = dowloadPic(big_path,
                                 isbn_path + '/big' + isbn + ".jpg")
            if small_res != 404:
                img_small = stored_small
            if big_res != 404:
                img_big = stored_big
        else:
            # An existing directory is taken to mean the images were already
            # downloaded; only the stored paths are recorded.
            img_small = stored_small
            img_big = stored_big

    source_type = 3
    row = [0, source_type, category, details, detail_url, price, add_time,
           info['packaging'], info['print_time'], info['impressions'], name,
           info['author'], info['press_name'], isbn, info['edition'],
           info['size'], info['press_time'], info['page_num'],
           info['word_num'], img_big, img_small]
    try:
        inserted = cursor.execute(_INSERT_SQL, row)
        if inserted == 1:
            print(u'插入成功')
        conn.commit()
    except Exception:
        # Report the failure instead of hiding it; the book is still skipped.
        traceback.print_exc()
        return 'back'


def getData(final_url):
    """Scrape the book detail page at `final_url` and insert it into bi_book.

    The URL is first written to ./url.txt. Returns 'back' when the page is
    skipped (page could not be loaded, unwanted page type, no ISBN, ISBN
    already stored, no cover element, or the insert failed) and None after
    a successful insert. The database connection opened here is always
    closed before returning.
    """
    with open('./url.txt', 'w') as url_file:
        url_file.write(final_url)

    # Connect to the database
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           passwd='root', db='bookinfo', charset='utf8')
    try:
        # Rows are returned as dicts so the count can be read by column name
        cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
        return _scrape_and_store(final_url, conn, cursor)
    finally:
        conn.close()


def winxuan(n):
    """Crawl every book linked from the n-th category of the home page menu.

    Returns 'backs' when the category link is missing or its page cannot be
    loaded; otherwise processes up to 248 links from the category page with
    getData, prints 'OK,finished' and returns None. Failures on individual
    books are ignored so one bad page does not stop the category.
    """
    # Parse the home page
    home_url = 'http://www.winxuan.com/'
    h = pq(home_url)
    # Category navigation link
    menu = h('.mod-mainmenu').find('dd').find('a').eq(n).attr('href')
    if menu is None:
        return 'backs'
    # Category landing page
    try:
        mh = pq(menu)
    except Exception:
        return 'backs'

    # Collect the first 248 links, de-duplicated; anchors without an href
    # (or positions past the end of the list) yield None and are dropped.
    anchors = mh('.main').find('a')
    detail_urls = set()
    for u in range(248):
        href = anchors.eq(u).attr('href')
        if href is not None:
            detail_urls.add(href)

    for final_url in detail_urls:
        try:
            getData(final_url)
        except Exception:
            continue
    print('OK,finished')


# Driver: walk categories 0..57, recording the current index in ./number.txt
# before each category is crawled.
for n in range(58):
    print(n)
    with open('./number.txt', 'w') as number_file:
        number_file.write(str(n))
    winxuan(n)

更多关于Python相关内容可查看本站专题：《Python Socket编程技巧总结》、《Python正则表达式用法总结》、《Python数据结构与算法教程》、《Python函数使用技巧总结》、《Python字符串操作技巧汇总》、《Python入门与进阶经典教程》及《Python文件与目录操作技巧汇总》

希望本文所述对大家Python程序设计有所帮助。

上一条：
Python实现爬取亚马逊数据并打印出Excel文件操作示例
下一条：
计算机二级python学习教程（2） python语言基本语法元素

0条评论 (评论内容有缓存机制,请悉知!)