Python爬取视频(其实是一篇福利)过程解析-侯体宗的博客

Python爬取视频(其实是一篇福利)过程解析
Python / 管理员发布于 7年前 179

窗外下着小雨，作为单身程序员的我逛着逛着发现一篇好东西，来自知乎问题“你都用 Python 来做什么？”下的第一个高亮答案。

到上面去看了看，地址都是明文的，得，赶紧开始吧。

下载流式文件，只要把requests请求的stream参数设为True就可以啦，详见requests官方文档“高级用法”中的“响应体内容工作流（Body Content Workflow）”一节。

先找一个视频地址试验一下：

# -*- coding: utf-8 -*-
import requests


def download_file(url, path):
    """Stream the resource at ``url`` into the local file ``path``.

    The body is fetched with ``stream=True`` and written to disk in
    1024-byte chunks rather than being loaded into memory at once.

    :param url: address of the file to download.
    :param path: destination file name; opened in binary write mode.
    """
    # The response object itself is used as the context manager here.
    with requests.get(url, stream=True) as response:
        chunk_size = 1024
        # Not used yet; a missing header raises KeyError before the
        # destination file is created.
        content_size = int(response.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)


if __name__ == '__main__':
    url = '就在原帖...'
    path = '想存哪都行'
    download_file(url, path)

遭遇当头一棒：

AttributeError: __exit__

这文档也会骗人的么！

看样子是当前安装的requests版本里，Response对象没有实现上下文管理器所需的__exit__方法（requests从2.18.0起，Response才可以直接用于with语句）。既然只是为了保证r最后能被close以释放连接池，那就使用contextlib的closing好了：

# -*- coding: utf-8 -*-
import requests
from contextlib import closing


def download_file(url, path):
    """Stream the resource at ``url`` into the local file ``path``.

    The response is wrapped in ``contextlib.closing`` so that its
    ``close()`` method is called when the ``with`` block ends.

    :param url: address of the file to download.
    :param path: destination file name; opened in binary write mode.
    """
    with closing(requests.get(url, stream=True)) as response:
        chunk_size = 1024
        # Not used yet; a missing header raises KeyError before the
        # destination file is created.
        content_size = int(response.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as out_file:
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)

程序正常运行了，不过我盯着这文件，怎么大小不见变啊，到底是完成了多少了呢？还是要让下好的内容及时存进硬盘，还能省点内存是不是：

# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import os


def download_file(url, path):
    """Stream ``url`` into ``path``, forcing every chunk onto disk.

    After each 1024-byte chunk the file object is flushed and
    ``os.fsync`` is called on its descriptor, so the size of the file
    on disk grows while the download is still running.

    :param url: address of the file to download.
    :param path: destination file name; opened in binary write mode.
    """
    with closing(requests.get(url, stream=True)) as response:
        chunk_size = 1024
        # Not used yet; a missing header raises KeyError before the
        # destination file is created.
        content_size = int(response.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as out_file:
            descriptor = out_file.fileno()
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)
                # Empty Python's buffer, then ask the OS to write it out.
                out_file.flush()
                os.fsync(descriptor)

文件以肉眼可见的速度在增大，真心疼我的硬盘。还是别每块都强制刷盘了，让数据留在缓冲区里、最后再统一写入硬盘吧，下载进度在程序中记个数就好了：

def download_file(url, path):
    """Stream ``url`` into ``path`` and print the fraction downloaded.

    After every chunk one line of the form ``已下载x%`` is printed, where
    the percentage is the number of bytes received so far divided by
    the ``content-length`` response header.

    :param url: address of the file to download.
    :param path: destination file name; opened in binary write mode.
    :raises KeyError: if the response has no ``content-length`` header.
    """
    with closing(requests.get(url, stream=True)) as response:
        chunk_size = 1024
        content_size = int(response.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as out_file:
            received = 0
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)
                # Count the bytes actually received rather than assuming
                # every chunk is a full 1024 bytes: the last chunk is
                # usually shorter, which used to push the total past 100%.
                received += len(chunk)
                loaded = float(received) / content_size
                print('已下载{0:%}'.format(loaded))

结果就很直观了：

已下载2.579129%
已下载2.581255%
已下载2.583382%
已下载2.585508%

心怀远大理想的我怎么会只满足于这一个呢，写个类一起使用吧：

# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import time


def download_file(url, path):
    """Stream ``url`` into ``path`` and report progress after each chunk.

    :param url: address of the file to download.
    :param path: destination file name; opened in binary write mode.
    :raises KeyError: if the response has no ``content-length`` header.
    """
    with closing(requests.get(url, stream=True)) as response:
        chunk_size = 1024 * 10
        content_size = int(response.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as out_file:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)
                p.output()


class ProgressData(object):
    """Print download progress, one report per ``output()`` call.

    Every call to ``output()`` is counted as one more ``block`` of data
    having arrived.
    """

    def __init__(self, block, size, unit, file_name='', ):
        """
        :param block: number of bytes accounted for by each ``output()`` call.
        :param size: total number of bytes expected.
        :param unit: label printed after the amounts.
        :param file_name: optional prefix for every printed line.
        """
        self.file_name = file_name
        # Both amounts are divided by 1000 before being displayed.
        self.block = block / 1000.0
        self.size = size / 1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more block and print amount, percentage and speed."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # Speed of this single block; 0 when the clock has not advanced.
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
        else:
            # The percentage goes before the "下载速度" label and the speed
            # after it; the two fields used to be the wrong way round.
            print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                self.file_name, loaded, self.unit,
                self.size, self.unit, progress, speed, self.unit))
            # Right-aligned bar of slashes that shrinks as progress grows.
            print('%50s' % ('/' * int((1 - progress) * 50)))

运行：

下载开始
下载进度10.24Kb/120174.05Kb 0.01% 下载速度4.75Kb/s
 /////////////////////////////////////////////////
下载进度20.48Kb/120174.05Kb 0.02% 下载速度32.93Kb/s
 /////////////////////////////////////////////////

看上去舒服多了。

下面要做的就是多线程同时下载了，主线程生产url放入队列，下载线程获取url：

# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import time
try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # Python 3 module name
import hashlib
import threading
import os

# Number of downloader threads started by the script.
WORKER_COUNT = 4
# Prefix prepended to every generated file name.
DOWNLOAD_DIR = 'e:/download/'


def download_file(url, path):
    """Stream ``url`` into ``path`` unless it has already been downloaded.

    The download is skipped when ``path`` exists and is at least as
    large as the ``content-length`` reported by the server.

    :param url: address of the file to download.
    :param path: destination file name; opened in binary write mode.
    :raises KeyError: if the response has no ``content-length`` header.
    """
    with closing(requests.get(url, stream=True)) as response:
        chunk_size = 1024 * 10
        content_size = int(response.headers['content-length'])
        if os.path.exists(path) and os.path.getsize(path) >= content_size:
            print('已下载')
            return
        print('下载开始')
        with open(path, "wb") as out_file:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size,
                             file_name=path)
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)
                p.output()


class ProgressData(object):
    """Print download progress, one report per ``output()`` call.

    Every call to ``output()`` is counted as one more ``block`` of data
    having arrived.
    """

    def __init__(self, block, size, unit, file_name='', ):
        """
        :param block: number of bytes accounted for by each ``output()`` call.
        :param size: total number of bytes expected.
        :param unit: label printed after the amounts.
        :param file_name: optional prefix for every printed line.
        """
        self.file_name = file_name
        # Both amounts are divided by 1000 before being displayed.
        self.block = block / 1000.0
        self.size = size / 1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more block and print amount, percentage and speed."""
        self.end = time.time()
        self.count += 1
        elapsed = self.end - self.start
        # Speed of this single block; 0 when the clock has not advanced.
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
        else:
            print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                self.file_name, loaded, self.unit,
                self.size, self.unit, progress, speed, self.unit))
            # Right-aligned bar of slashes that shrinks as progress grows.
            print('%50s' % ('/' * int((1 - progress) * 50)))


queue = Queue.Queue()


def run():
    """Worker loop: download queued URLs until the ``None`` marker arrives.

    Each URL is saved under ``DOWNLOAD_DIR`` as ``<md5 of the url>.mp4``.
    The loop also ends when no item arrives within 100 seconds.
    """
    while True:
        try:
            url = queue.get(timeout=100)
        except Queue.Empty:
            # Nothing was queued for 100 seconds; stop instead of letting
            # the exception kill the thread with a traceback.
            break
        if url is None:
            # Only one marker is queued, so hand it on: otherwise the
            # other workers would never see it.
            queue.put(None)
            break
        # hashlib works on bytes; text URLs are encoded first.
        data = url if isinstance(url, bytes) else url.encode('utf-8')
        name = hashlib.md5(data).hexdigest()
        path = DOWNLOAD_DIR + name + '.mp4'
        download_file(url, path)


def get_url():
    """Producer placeholder: queue the URLs, then the ``None`` end marker."""
    queue.put(None)


if __name__ == '__main__':
    get_url()
    workers = []
    for i in range(WORKER_COUNT):
        t = threading.Thread(target=run)
        t.daemon = True
        t.start()
        workers.append(t)
    # The workers are daemon threads, so the main thread has to wait for
    # them; returning here would end the process and the downloads with it.
    for t in workers:
        while t.is_alive():
            # Short timeouts keep the main thread responsive to Ctrl-C.
            t.join(1)
    print(u'全下完啦')

加了重复下载的判断，至于怎么源源不断地生产url，诸位摸索吧，保重身体！

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持。

上一条：
用Cython加速Python到起飞(推荐)
下一条：
基于python 微信小程序之获取已存在模板消息列表

0条评论 (评论内容有缓存机制,请悉知!)

最新最热

近期文章
在go语言中实现字符串可逆性压缩及解压缩功能(0个评论)
使用go + gin + jwt + qrcode实现网站生成登录二维码在app中扫码登录功能(0个评论)
在windows10中升级go版本至1.24后LiteIDE的Ctrl+左击无法跳转问题解决方案(0个评论)
智能合约Solidity学习CryptoZombie第四课:僵尸作战系统(0个评论)
智能合约Solidity学习CryptoZombie第三课:组建僵尸军队(高级Solidity理论)(0个评论)
智能合约Solidity学习CryptoZombie第二课:让你的僵尸猎食(0个评论)
智能合约Solidity学习CryptoZombie第一课:生成一只你的僵尸(0个评论)
在go中实现一个常用的先进先出的缓存淘汰算法示例代码(0个评论)
在go+gin中使用"github.com/skip2/go-qrcode"实现url转二维码功能(0个评论)
在go语言中使用api.geonames.org接口实现根据国际邮政编码获取地址信息功能(1个评论)

近期评论
122 在
学历：一种延缓就业设计，生活需求下的权衡之选中评论工作几年后，报名考研了，到现在还没认真学习备考，迷茫中。作为一名北漂互联网打工人..
123 在
Clash for Windows作者删库跑路了，github已404中评论按理说只要你在国内，所有的流量进出都在监控范围内，不管你怎么隐藏也没用，想搞你分..
原梓番博客在
在Laravel框架中使用模型Model分表最简单的方法中评论好久好久都没看友情链接申请了，今天刚看，已经添加。..
博主在
佛跳墙vpn软件不会用?上不了网?佛跳墙vpn常见问题以及解决办法中评论 @1111老铁这个不行了，可以看看近期评论的其他文章..
1111 在
佛跳墙vpn软件不会用?上不了网?佛跳墙vpn常见问题以及解决办法中评论网站不能打开，博主百忙中能否发个APP下载链接，佛跳墙或极光..

Top