Sharing source code for downloading full-size Baidu images with multiple threads

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
from Baidu import getImageUrlList, search, nextPage, searchResult
from Downloader import downloadFromQueue
from FileHelper import getFilenameFromURL, addExtension, makedir
from Queue import Queue
from thread import start_new_thread
from Config import Config
from NetworkPrepare import prepare
import os, sys

def baseURL():
  if Config.site == 'baidu':
    return search(Config.keyword, Config.addtional)
  if Config.site == 'jandan':
    return 'http://jandan.net/ooxx'

def main():
  # initial setup
  prepare()
  while_n = 0 # loop counter
  imglist = []
  makedir(Config.directory)
  print 'Generating search URL'
  URL = baseURL()
  # Download #############
  # take the smaller of the search-result count and Config.count
  count = min(searchResult(URL), Config.count)
  # exit when there are no search results
  if not count:
    print "No search result at current condition."
    sys.exit(1)
  # collect the requested number of URLs into a list
  print 'Fetching page',
  while len(imglist) < count:
    print while_n,
    while_n += 1
    tmplist = getImageUrlList(URL)
    imglist = imglist + tmplist
    URL = nextPage(URL, len(tmplist))
  print '' # newline
  count = len(imglist)
  print "There're %d files to download" % count
  # drop files that are already on disk from imglist
  imglist = [url for url in imglist
             if not getFilenameFromURL(url) in os.listdir(Config.directory)]
  print "There's %d files already downloaded." % (count - len(imglist))
  # download the list
  print 'Downloading %d files' % len(imglist)
  queue = Queue()
  for url in imglist:
    queue.put(url)
  failure = []
  for i in range(Config.thread_count):
    start_new_thread(downloadFromQueue,
                     (queue, failure, Config.directory, Config.timeout))
  queue.join()
  print "%d failed to fetch." % len(failure)

def clean():
  # cleanup
  # 1. add file extensions
  print 'Adding extension ...'
  for fname in os.listdir(Config.directory):
    addExtension(Config.directory + os.sep + fname, '.jpg')
  print 'done.'
  # 2. save cookies
  Config.cj.save()

if __name__ == "__main__":
  main()
  clean()
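
# Usage sketch (module file names are assumptions inferred from the imports:
# save each block below as Baidu.py, Downloader.py, FileHelper.py, Config.py,
# NetworkPrepare.py, MyParser.py, String.py, and this one as App.py):
#   $ python2 App.py
# Images land in Config.directory ('image' by default); tweak Config first.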

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
from Downloader import getStream
from MyParser import MyParser
from String import longestString, cutTo, cutBegin, getCodingContent
from urllib import urlencode
import json
import re

def getImageUrlFromScript(script):
  pattern = re.compile(r'(?<="objURL":").*?(?=")')
  groups = pattern.findall(script)
  new_group = [amatch.strip() for amatch in groups] # strip whitespace, list-comprehension style
  return new_group
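
# Minimal sketch of the regex above on a synthetic snippet (not real Baidu
# output): it pulls every "objURL" value out of the page's inline JSON.
def _demo_getImageUrlFromScript():
  script = '{"objURL":"http://a/1.jpg"},{"objURL":"http://b/2.jpg"}'
  return getImageUrlFromScript(script)  # ['http://a/1.jpg', 'http://b/2.jpg']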

def getImageUrlList(url):
  imglist = []
  for i in _getJsonList(url):
    imglist.append(i['objURL'].strip())
  return imglist

def _getJsonList(url):
  stream = getStream(url)
  data = getCodingContent(stream)
  pattern = re.compile(r'(?<=var imgdata =).*?(?=;v)')
  block = pattern.findall(data)[0]
  jsonlist = json.loads(block)
  return jsonlist['data'][:-1] # the trailing element of the data array is an empty placeholder

def nextPage(url, pn):
  url_pn = cutBegin(url, '&pn=')
  if not url_pn:
    url_pn = 0
  url_pn = int(url_pn) + pn
  return cutTo(url, '&pn') + '&pn=' + str(url_pn)
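
# Illustrative sketch of how nextPage advances the pn parameter
# (the URL here is a stripped-down stand-in for a real search URL):
def _demo_nextPage():
  url = 'http://image.baidu.com/i?word=test'
  url = nextPage(url, 20)  # no pn yet, so 0 + 20 -> '...&pn=20'
  url = nextPage(url, 20)  # existing pn=20 plus 20 -> '...&pn=40'
  return url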

def search(keyword, addtionParams={}):
  """Generate a search url by the given keyword.
  params keyword: utf8 string"""
  url = 'http://image.baidu.com/i?'
  parser = MyParser()
  params = _getParams('http://image.baidu.com', parser)
  params.update(addtionParams)
  params.update({'word':keyword.decode('utf8').encode('gbk')})
  return url + urlencode(params)

def searchResult(url):
  parser = MyParser()
  parser.feed(getCodingContent(getStream(url)))
  block = longestString(parser.scriptList)
  parser.close()
  pattern = re.compile(r'(?<="listNum":)\d*(?=,)')
  count = pattern.findall(block)
  if count:
    count = int(count[0])
    return count
  return 0

def _getParams(url, parser):
  """Get a dict contained the url params"""
  stream = getStream(url)
  data = getCodingContent(stream)
  parser.feed(data)
  return parser.formParams

def _appendParams(adict):
  """Generate a url with params in adict."""
  p = [key + '=' + adict[key] for key in adict]
  return '&'.join(p)

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
from cookielib import LWPCookieJar
class Config:
  keyword = '美女' # search keyword; do not change this file's encoding
  addtional = {'width':'1920', 'height':'1200'} # width and height; may be empty {}
  directory = r'image'  # where downloads are stored
  count = 30     # number to download; rounded up to a multiple of 20
  thread_count = 15 # number of threads
  timeout = 20 # download timeout in seconds; 10 seemed a bit short, 20 works
  # proxy settings
  proxy = 'http://localhost:7001'
  use_proxy = False
  proxy_user = 'user_name'
  proxy_pass = 'password'
  proxy_auth = False
  cookies = 'cookies.txt'
  use_cookies = True
  cj = LWPCookieJar(cookies)
  site = 'baidu'  # or site = 'jandan'
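
  # Example tweak (values are placeholders): to route requests through an
  # authenticated local proxy, flip the flags and fill in credentials:
  #   use_proxy = True
  #   proxy_auth = True
  #   proxy = 'http://localhost:7001'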

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
from FileHelper import getFilenameFromURL, writeBinFile
import urllib2

def getStream(url, timeout=10):
  # return a url stream, or False on failure
  request = urllib2.Request(url)
  request.add_header('User-Agent', UserAgent.Mozilla)
  try:
    stream = urllib2.urlopen(request, timeout=timeout)
  except (Exception, SystemExit): # catch SystemExit to keep running
    print "URL open error. Probably timed out."
    return False
  return stream

def downloadFromQueue(queue, failure, directory='.', timeout=10):
  """Get files from 
3c48
a list of urls.
  return : list, contained the failure fetch"""
  while not queue.empty():
    url = queue.get()
    stream = getStream(url, timeout=timeout)
    file_name = getFilenameFromURL(url)
    if stream and writeBinFile(stream, file_name, directory):
      queue.task_done()
      print "Fetching", url, 'done.'
      continue
    failure.append(url)
    queue.task_done()
  return failure
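
# Standalone sketch of the worker pattern above (URLs and thread count are
# placeholders): each thread drains the shared Queue, and queue.join() in
# the caller blocks until task_done() has been called for every item.
def _demo_downloadFromQueue():
  from Queue import Queue
  from thread import start_new_thread
  q = Queue()
  for url in ['http://example.com/a.jpg', 'http://example.com/b.jpg']:
    q.put(url)
  failed = []
  for _ in range(2):
    start_new_thread(downloadFromQueue, (q, failed, '.', 10))
  q.join()
  return failed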

class UserAgent:
  Mozilla = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14'

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
import re, os
def getFilenameFromURL(url):
  # used by Downloader
  pos = url.rfind('/')
  shorted = url[pos + 1:]
  pattern = re.compile(r'\w*[\.\w]*')
  f_name = pattern.findall(shorted)[0]
  return f_name

def addExtension(fname, ext):
  # used by the main script to add an extension
  # only add one if the name has no extension yet
  if '.' not in fname:
    rename(fname, ext)

def rename(old, ext):
  # ext='.jpg'
  if os.path.isfile(old + ext):
    ext = '2' + ext
    rename(old, ext)
    return None
  print 'rename', old, old + ext
  os.rename(old, old + ext)

def makedir(directory):
  if not os.path.isdir(directory):
    os.mkdir(directory) # if directory is a regular file, let the exception propagate and the program exit

def writeBinFile(stream, file_name, directory='.', mode='wb'):
  """Read from the given url and write to file_name."""
  file_name = directory + os.sep + file_name
  if os.path.isfile(file_name):
    print 'File %s exists.' % file_name
    return False
  CHUNK_SIZE = 1024
  with open(file_name, mode) as fp:
    while True:
      try:
        chunk = stream.read(CHUNK_SIZE)
      except (Exception, SystemExit):
        print 'Fetching error. Probably timed out.'
        fp.close() # close before removing the partial file
        os.remove(file_name)
        return False
      if not chunk:
        break
      fp.write(chunk)
  return True
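
# Quick sketch of writeBinFile's chunked copy using an in-memory stream
# (StringIO stands in here for a real urllib2 response object):
def _demo_writeBinFile():
  from StringIO import StringIO
  return writeBinFile(StringIO('fake image bytes'), 'demo.bin', '.')
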
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
import HTMLParser

class MyParser(HTMLParser.HTMLParser):
  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    self.toggle_script_parse = False
    self.toggle_form_parse = False
    self.scriptList = []
    self.formParams = {}
    self.result = 0

  def handle_starttag(self, tag, attrs):
    HTMLParser.HTMLParser.handle_starttag(self, tag, attrs)
    attrs = dict(attrs)
    if tag == 'script':
      self.toggle_script_parse = True
    # start parsing the form so hidden input fields can be collected
    if tag == 'form' and 'name' in attrs and attrs['name'] == 'f1':
      self.toggle_form_parse = True
    if tag == 'input' and self.toggle_form_parse:
      if 'type' in attrs and attrs['type'] == 'hidden':
        key = attrs['name']
        value = attrs['value']
        self.formParams[key] = value

  def handle_endtag(self, tag):
    HTMLParser.HTMLParser.handle_endtag(self, tag)
    if tag == 'form' and self.toggle_form_parse:
      self.toggle_form_parse = False

  def handle_data(self, data):
    HTMLParser.HTMLParser.handle_data(self, data)
    if self.toggle_script_parse:
      self.scriptList.append(data)
      self.toggle_script_parse = False

  def reset(self):
    HTMLParser.HTMLParser.reset(self)
    self.toggle_script_parse = False
    self.toggle_form_parse = False
    self.scriptList = []
    self.formParams = {}
    self.result = 0

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
import urllib2
from Config import Config

def proxy_handler(proxy, use_proxy, proxy_auth=False, puser='', ppass=''):
  if use_proxy:
    return urllib2.ProxyHandler({"http" : proxy})
  return urllib2.ProxyHandler({})

def cookie_handler(cj):
  try:
    cj.revert() # reload cookies from the jar's own file
  except Exception:
    pass # no saved cookie file yet
  cj.clear_expired_cookies()
  return urllib2.HTTPCookieProcessor(cj)

def prepare():
  ch = cookie_handler(Config.cj)
  ph = proxy_handler(Config.proxy, Config.use_proxy)
  if Config.proxy_auth:
    pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
    pm.add_password(None, Config.proxy, Config.proxy_user, Config.proxy_pass)
    urllib2.install_opener(urllib2.build_opener(ch, ph, urllib2.ProxyBasicAuthHandler(pm)))
    return
  urllib2.install_opener(urllib2.build_opener(ch, ph))

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
def determinCoding(content, header):
  """Determine the encoding of a URL's content from its header,
  falling back to a charset declared in the content itself.
  params header : HTTPMessage-like mapping"""
  content_type = header['Content-Type']
  tag = 'charset='
  if content_type and tag in content_type:
    pos = content_type.index(tag) + len(tag)
    return content_type[pos:]
  content = content.lower()
  if tag in content:
    startpos = content.index(tag) + len(tag)
    endpos = content.index('"', startpos) # the charset value ends at the closing quote
    return content[startpos:endpos]
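
# Behaviour sketch for determinCoding (a plain dict stands in for the real
# stream.headers object; values are made up): the header wins, and the
# charset embedded in the content is only a fallback.
def _demo_determinCoding():
  header = {'Content-Type': 'text/html; charset=gbk'}
  assert determinCoding('', header) == 'gbk'
  html = '<meta content="text/html; charset=utf-8">'
  assert determinCoding(html, {'Content-Type': 'text/html'}) == 'utf-8'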

def getCodingContent(stream):
  """Read the stream and return its content decoded to unicode.
  return - content : unicode string"""
  content = stream.read()
  coding = determinCoding(content, stream.headers)
  stream.close()
  return content.decode(coding)

def longestString(alist):
  """Return the longest string of a list of strings."""
  return max(alist, key=len)

def cutTo(str_1, str_2):
  """Cut str_1 to the position just befor str_2."""
  # 不包含 str_2
  if not str_2 in str_1 :
    return str_1
  pos = str_1.index(str_2)
  return str_1[0:pos]

def cutBegin(str_1, str_2):
  # used by nextPage in Baidu
  if str_2 not in str_1:
    return None
  pos = str_1.index(str_2) + len(str_2)
  return str_1[pos:]
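
# Behaviour sketch for the two cut helpers above:
def _demo_cut_helpers():
  assert cutTo('a&pn=20', '&pn') == 'a'        # everything before str_2
  assert cutBegin('a&pn=20', '&pn=') == '20'   # everything after str_2
  assert cutBegin('a', '&pn=') is None         # str_2 absent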