侯体宗的博客
  • 首页
  • 人生(杂谈)
  • 技术
  • 关于我
  • 更多分类
    • 文件下载
    • 文字修仙
    • 中国象棋ai
    • 群聊
    • 九宫格抽奖
    • 拼图
    • 消消乐
    • 相册

python爬取本站电子书信息并入库的实现代码

Python  /  管理员 发布于 7年前   208

入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库

数据库工具类:DBUtil.py

import pymysql


class DBUtils(object):
    """Thin helper around a pymysql connection for the book-scraper scripts.

    Each method takes the connection/cursor explicitly; callers obtain them
    from connDB() and release them with connClose().
    """

    def connDB(self):
        """Open a connection and return (connection, cursor).

        NOTE(review): host/credentials are hard-coded; move them to
        configuration or environment variables before real use.
        """
        conn = pymysql.connect(host='192.168.251.114', port=3306,
                               user='root', passwd='b6f3g2',
                               db='yangsj', charset='utf8')
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):
        """Execute an INSERT/UPDATE statement and commit; return affected rows."""
        sta = cur.execute(sql)
        conn.commit()
        return (sta)

    def exeDelete(self, conn, cur, IDs):
        """Delete rows for a space-separated string of ids (demo; unused).

        Fixed to use a parameterized query instead of ``%d`` string
        formatting, which was an SQL-injection-prone pattern.
        """
        sta = 0
        for eachID in IDs.split(' '):
            sta += cur.execute("delete from students where Id=%s",
                               (int(eachID),))
        conn.commit()
        return (sta)

    def exeQuery(self, cur, sql):
        """Execute a SELECT; return (row count, cursor) so the caller can fetch."""
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):
        """Close cursor then connection, releasing resources."""
        cur.close()
        conn.close()


if __name__ == '__main__':
    dbUtil = DBUtils()
    conn, cur = dbUtil.connDB()

书籍操作文件 bookOpe.py

from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

logging.basicConfig(
    level=logging.INFO
)


class BookOperator(object):
    """Persists a scraped Book and its download links through DBUtils."""

    def __addBook(self, book):
        """Insert one row into `book`.

        Uses a parameterized query: bookName/bookInfo are scraped from an
        external site (untrusted input) and must never be %-formatted
        directly into the SQL text.
        """
        logging.info("add book:%s" % book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        insertBookSql = ("insert into book (bookName,bookUrl,bookInfo) "
                         "values (%s,%s,%s);")
        cur.execute(insertBookSql,
                    (book.bookName, book.downLoadUrl, book.mainInfo))
        conn.commit()
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):
        """Return the highest id in `book`, assumed to be the row just added.

        NOTE(review): racy if several writers insert concurrently;
        pymysql's cursor.lastrowid would be safer — confirm before relying
        on this in a multi-writer setup.
        """
        logging.info("selectLastBookId ")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):
        """Insert every download link for bookId (parameterized, one commit)."""
        logging.info("add bookId:%s" % bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        insertBookDownLoadInfo = ("insert into book_down_url "
                                  "(bookId,downName,downUrl) "
                                  "values (%s,%s,%s);")
        for downLoadinfo in downLoadInfos:
            cur.execute(insertBookDownLoadInfo,
                        (bookId, downLoadinfo.downName, downLoadinfo.downUrl))
        conn.commit()
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):
        """Public entry point: store the book, then all its download links."""
        logging.info("add bookInfo:%s" % book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)


if __name__ == '__main__':
    bookope = BookOperator()
    book = Book("aaa", "yang", "cccc")
    book.addDownLoadUrl(DownLoadInfo("aaa.html", "书籍"))
    bookope.addBookInfo(book)

书籍信息文件 bookInfo.py

import sys

# NOTE(review): the original assigned ``sys.encoding = "utf8"``; the sys
# module has no such attribute and nothing in the project reads it, so the
# no-op assignment was dropped.


class Book(object):
    """Book information scraped from a download page."""

    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo        # description text block
        self.downLoadUrl = downLoadUrl  # URL of the page the info came from
        self.bookName = bookName        # book title
        self.downLoadInfos = []         # list of DownLoadInfo mirrors

    def addDownLoadUrl(self, downloadInfo):
        """Append one DownLoadInfo entry to this book."""
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        """Print the book title (debug helper)."""
        print("bookName :%s" % (self.bookName))


class DownLoadInfo(object):
    """One download mirror: link plus display name."""

    def __init__(self, downUrl, downName):
        self.downUrl = downUrl    # download link href
        self.downName = downName  # link display text

    def print_down_info(self):
        """Print the mirror link and name (debug helper)."""
        print("downLoad %s - %s" % (self.downUrl, self.downName))

书籍列表页面解析文件(文件名沿用了51job爬虫示例的命名):FiveOneJobFetch.py

import requests
from bs4 import BeautifulSoup
import sys
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging

# NOTE(review): the original assigned ``sys.encoding = "utf8"``; the sys
# module has no such attribute and nothing reads it, so the no-op
# assignment was dropped.


class PageFetch(object):
    """Fetches and parses the book site's listing, download and detail pages."""

    host = "///"         # site domain — left as a placeholder in the article
    category = "books/"  # listing category path

    def __init__(self, pageUrl):
        self.pageUrl = pageUrl  # short page name, e.g. "list152_1.html"
        self.url = PageFetch.host + PageFetch.category + pageUrl  # full URL

    def __getPageContent(self):
        """Fetch this instance's own page; delegates to the static helper
        (the original duplicated the fetch/decode logic in two methods)."""
        return PageFetch.getPageContent(self.url)

    @staticmethod
    def getPageContent(url):
        """GET ``url`` and return its text decoded as gb2312, or '' on non-200."""
        req = requests.get(url)
        if req.status_code == 200:
            # Site serves gb2312; override requests' guessed encoding.
            req.encoding = "gb2312"
            return req.text
        return ""

    def __getMaxPageNumAndUrl(self):
        """Walk the pager (``.plist``) until the last page is reached.

        Returns (max page count, href of a pager link such as 'list45_2.html'
        used as the URL pattern).
        """
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                # Debug output kept from the original article.
                print("数据")
                print(ul)
                maxPageNum = ul.select("strong")[0].text
                alink = ul.select("a")
                # A trailing "#" link means we are on the final pager page.
                if alink[-1]['href'] == "#":
                    maxLink = alink[1]['href']
                else:
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        """Build the short URL of page ``pageNum + 1`` from the
        'listNN_M.html' pattern of self.pageUrl."""
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return (self.pageUrl[:lineBeginSite] + str(pageNum + 1) +
                self.pageUrl[docBeginSite:])

    def getBookPageList(self):
        """Return the full URL of every listing page in this category."""
        shortPageList = []
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        for i in range(int(maxPageNum)):
            shortPageList.append(self.host + self.category + self.__formatPage(i))
        return shortPageList

    @staticmethod
    def getDownloadPage(url):
        """Return the download-page URL of every book on one listing page."""
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        """Parse one download page into a Book with its DownLoadInfo entries."""
        logging.info("获取书籍信息url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        # Single quotes are stripped because downstream SQL was string-built;
        # kept for compatibility even with parameterized inserts.
        mainInfo = (soup.select("#soft-intro"))[0].text.replace("截图:", "").replace("'", "")
        title = (soup.select("dl dt h1"))[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                downLoadInfo = DownLoadInfo(li.select("a")[0]['href'],
                                            li.select("a")[0].text)
                book.addDownLoadUrl(downLoadInfo)
        return book


if __name__ == '__main__':
    p = PageFetch("list152_1.html")
    shortPageList = p.getBookPageList()
    downPage = []
    for page in shortPageList:
        downLoadPage = PageFetch.getDownloadPage(page)
        downPage = downPage + downLoadPage
    print("================汇总如下===============================")
    for bookDownLoadPage in downPage:
        book = PageFetch.getBookInfo(bookDownLoadPage)
        print(book.bookName + ":%s" % book.downLoadUrl)
        for d in book.downLoadInfos:
            print("%s - %s" % (d.downUrl, d.downName))
    # p = PageFetch("list977_1.html")
    # p = p.getMaxPageNumAndUrl()
    # print(p)

执行文件 51Job.py:将以上各文件复制到同一文件夹下,执行此文件即可

from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator


def main(url):
    """Scrape every book reachable from one listing page and store it."""
    fetcher = PageFetch(url)
    listing_pages = fetcher.getBookPageList()
    operator = BookOperator()
    download_pages = []
    # Collect the download-page URLs from every listing page first ...
    for listing_page in listing_pages:
        download_pages += PageFetch.getDownloadPage(listing_page)
    # ... then parse each one and persist the resulting Book.
    for download_page in download_pages:
        operator.addBookInfo(PageFetch.getBookInfo(download_page))
    print("数据抓取成功:" + url)


if __name__ == '__main__':
    urls = ["list152_35.html", "list300_2.html", "list476_6.html",
            "list977_2.html", "list572_5.html", "list509_2.html",
            "list481_1.html", "list576_1.html", "list482_1.html",
            "list483_1.html", "list484_1.html"]
    for url in urls:
        main(url)

数据库表:书籍信息表和下载地址表

-- Book master table: one row per scraped book.
CREATE TABLE `book` (
    `id`       INT(11)      NOT NULL AUTO_INCREMENT,
    `bookName` VARCHAR(200) NULL DEFAULT NULL,
    `bookUrl`  VARCHAR(500) NULL DEFAULT NULL,
    `bookInfo` TEXT         NULL,
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;
-- Download mirrors: one row per download link, keyed to book.id via bookId.
CREATE TABLE `book_down_url` (
    `id`       INT(11)       NOT NULL AUTO_INCREMENT,
    `bookId`   INT(11)       NOT NULL DEFAULT '0',
    `downName` VARCHAR(200)  NOT NULL DEFAULT '0',
    `downUrl`  VARCHAR(2000) NOT NULL DEFAULT '0',
    PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;

git地址:https://git.oschina.net/yangsj/BookFetch/tree/master


  • 上一条:
    Python实现FLV视频拼接功能
    下一条:
    python机器学习库xgboost的使用
  • 昵称:

    邮箱:

    0条评论 (评论内容有缓存机制,请悉知!)
    最新最热
    • 分类目录
    • 人生(杂谈)
    • 技术
    • linux
    • Java
    • php
    • 框架(架构)
    • 前端
    • ThinkPHP
    • 数据库
    • 微信(小程序)
    • Laravel
    • Redis
    • Docker
    • Go
    • swoole
    • Windows
    • Python
    • 苹果(mac/ios)
    • 相关文章
    • 在python语言中Flask框架的学习及简单功能示例(0个评论)
    • 在Python语言中实现GUI全屏倒计时代码示例(0个评论)
    • Python + zipfile库实现zip文件解压自动化脚本示例(0个评论)
    • python爬虫BeautifulSoup快速抓取网站图片(1个评论)
    • vscode 配置 python3开发环境的方法(0个评论)
    • 近期文章
    • 在go语言中实现字符串可逆性压缩及解压缩功能(0个评论)
    • 使用go + gin + jwt + qrcode实现网站生成登录二维码在app中扫码登录功能(0个评论)
    • 在windows10中升级go版本至1.24后LiteIDE的Ctrl+左击无法跳转问题解决方案(0个评论)
    • 智能合约Solidity学习CryptoZombie第四课:僵尸作战系统(0个评论)
    • 智能合约Solidity学习CryptoZombie第三课:组建僵尸军队(高级Solidity理论)(0个评论)
    • 智能合约Solidity学习CryptoZombie第二课:让你的僵尸猎食(0个评论)
    • 智能合约Solidity学习CryptoZombie第一课:生成一只你的僵尸(0个评论)
    • 在go中实现一个常用的先进先出的缓存淘汰算法示例代码(0个评论)
    • 在go+gin中使用"github.com/skip2/go-qrcode"实现url转二维码功能(0个评论)
    • 在go语言中使用api.geonames.org接口实现根据国际邮政编码获取地址信息功能(1个评论)
    • 近期评论
    • 122 在

      学历:一种延缓就业设计,生活需求下的权衡之选中评论 工作几年后,报名考研了,到现在还没认真学习备考,迷茫中。作为一名北漂互联网打工人..
    • 123 在

      Clash for Windows作者删库跑路了,github已404中评论 按理说只要你在国内,所有的流量进出都在监控范围内,不管你怎么隐藏也没用,想搞你分..
    • 原梓番博客 在

      在Laravel框架中使用模型Model分表最简单的方法中评论 好久好久都没看友情链接申请了,今天刚看,已经添加。..
    • 博主 在

      佛跳墙vpn软件不会用?上不了网?佛跳墙vpn常见问题以及解决办法中评论 @1111老铁这个不行了,可以看看近期评论的其他文章..
    • 1111 在

      佛跳墙vpn软件不会用?上不了网?佛跳墙vpn常见问题以及解决办法中评论 网站不能打开,博主百忙中能否发个APP下载链接,佛跳墙或极光..
    • 2016-10
    • 2016-11
    • 2018-04
    • 2020-03
    • 2020-04
    • 2020-05
    • 2020-06
    • 2022-01
    • 2023-07
    • 2023-10
    Top

    Copyright·© 2019 侯体宗版权所有· 粤ICP备20027696号 PHP交流群

    侯体宗的博客