从糗事百科下载数据的python方法示例

#encoding:utf-8

import sqlite3,urllib2,re,glob

_database='db.sqlite3'
_cn=sqlite3.connect(_database)
c=_cn.cursor()

findById='select * from qiu where id=?'
findByStatus='select * from qiu where status=?'
findContentNotEmpty='select * from qiu where content<>""'
deleteById='delete * from qiu where id=?'
insertId='insert into qiu(id) values(?)'
insert='insert into qiu(id,content,status) values(?,?,?)'
updateContentAndStatusById='update qiu set content=?,status=? where id=?'


def getRowCount(c):
  row=c.fetchall()
  return(len(row))

class qparser():

  def __init__(self,id):
    self.page_id=id
    self.url=self.getUrl(self.page_id)
    self.getPage()
    p=self.pageParser()
    if p[0]:
      self.updateDB(p)

  def getUrl(self,i):
    return("http://www.qiushibaike.com/articles/"+str(i)+".htm")

  def getPage(self):
    try:
      print('downloading '+self.url+'...')
      content=urllib2.urlopen(self.url).read()
      #print content
      print('download '+self.url+' finished')
      self.content=content
    except:
      self.content=''
      print('download '+self.url+' error')

#www.iplaypy.com

  def getContent(self,page):
    try:
      begin=page.find(r'<div class="content"')+25
      page=page[begin:]
      end=page.find(r'<div')-4
      page=page[:end]
      page.replace(r'<br />','').replace('\n','')
      return(page)
    except:
      return('')

  def pageParser(self):
    page=self.content
    print('parsing the page')
    result=[None]*3
    recontent=r'<div class=\"content\">\s+(.*)\s+'
    relast=r'<a href=\"\/articles\/(.*)\.htm\">&lt;<span class=\'ad\'> <\/span>上一糗事<\/a>'
    renext=r'<a href=\"\/articles\/(.*)\.htm\">下一糗事<span class=\'ad\'>'

    p=page
    result[0]=self.getContent(p)

    matches=re.findall(relast,page)
    if len(matches)>0:
        result[1]=matches[0]

    matches=re.findall(renext,page)
    if len(matches)>0:
        result[2]=matches[0]

    print('parsed the page')

    return(result)

  def updateDB(self,p): 
    content=p[0]
    last=p[1]
    next=p[2]

    if last:
      c.execute(findById,(last,))
      l=getRowCount(c)
      if l==1:
        c.close()
      elif l>1:
        c.close()
        c.execute(deleteById,(last,))
        c.execute(insertId,(last,))
        _cn.commit
      else:
        c.close()
        c.execute(insertId,(last,))
        _cn.commit

    if next:
      c.execute(findById,(next,))
      l=getRowCount(c)
      if l==1:
        c.close()
      elif l>1:
        c.close()
        c.execute(deleteById,(next,))
        c.execute(insertId,(next,))
        _cn.commit()
      else:
        c.close()
        c.execute(insertId,(next,))
        _cn.commit()

    if last and next:
      c.execute(findById,(self.page_id,))
      l=getRowCount(c)
      if l>1:
        c.close()
        c.execute(deleteById,(self.page_idid,))
        c.execute(insert,(self.page_id,content,1))
        _cn.commit()
      else:
        c.close()
        c.execute(updateContentAndStatusById,(content,1,self.page_id))
        _cn.commit()
    else:
      c.execute(updateContentAndStatusById,(content,1,self.page_id))
      _cn.commit()

class downloader():

  def __init__(self):
    idList=self.getIdList()
    while len(idList)!=0:
      for i in idList:
        q=qparser(i)
      self.DbToText()
      idList=self.getIdList()

  def getIdList(self):
    idList=[]
    c.execute(findByStatus,(0,))  
    for i in c:
      idList.append(i[0])
    c.close()
    return(idList)

  def DbToText(self):
    c.execute(findContentNotEmpty)
    txtList=glob.glob('*.txt')
    txtList=[i[0:-4] for i in txtList]
    for i in c:
      id=i[0]
      content=i[1].replace(r'<br />','').replace('\n','')
      if id not in txtList:
        fileName=self.makeFileName(id)
        open(fileName,'w').write(content.encode('gbk'))

  def makeFileName(self,i):
    return(str(i)+'.txt')

def main():
  d=downloader()

if __name__=='__main__':
  main()