Python爬取读者并制作成PDF_python教程-查字典教程网

学了下 BeautifulSoup 后,做了个网络爬虫,爬取《读者》杂志并用 reportlab 制作成 PDF。

crawler.py

代码如下:

#!/usr/bin/env python

#coding=utf-8

"""

Author: Anemone

Filename: crawler.py

Last modified: 2015-02-19 16:47

E-mail: anemone@82flex.com

"""

import urllib2

from bs4 import BeautifulSoup

import re

import sys

# Python 2-only workaround: reload the sys module, then make UTF-8 the
# process-wide default encoding used for implicit str <-> unicode conversions.
# NOTE(review): presumably needed because the scraped Chinese text (unicode)
# gets mixed with byte strings elsewhere, e.g. through str.format -- confirm
# before removing this or porting the script to Python 3.
reload(sys)

sys.setdefaultencoding('utf-8')

def getEachArticle(url):
    """Download one article page and extract its metadata and body text.

    Args:
        url: absolute URL of the article page.

    Returns:
        A dict with the keys:
          "title"   -- ``.string`` of the first <h1> element,
          "writer"  -- stripped ``.string`` of the element with id "pub_date",
          "from"    -- stripped ``.string`` of the element with id "media_name",
          "context" -- the second piece obtained by splitting the page's
                       plain text on the regex ``BAIDU_CLB.*;``.

    Raises:
        IndexError: the page text does not contain the ``BAIDU_CLB...;``
            marker, so there is no second piece to use as the body.
        AttributeError: one of the expected elements is missing (the lookups
            are not guarded).
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails.
        response.close()

    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()

    # Split on the marker line(s); index 1 is the text that follows the first
    # marker (and precedes the next one, if the page has several).
    pieces = re.split("BAIDU_CLB.*;", soup.get_text())
    if len(pieces) < 2:
        # Same exception type as an out-of-range index, but with a message
        # that says which page could not be parsed.
        raise IndexError("BAIDU_CLB marker not found in page: %s" % url)

    return {"title": title, "writer": writer, "from": _from, "context": pieces[1]}

#new=open("new.txt","w")

#new.write(result["title"]+"\n\n")

#new.write(result["writer"]+" "+result["from"])

#new.write(result["context"])

#new.close()

def _fetchHtml(url):
    """Return the raw body of *url*, closing the HTTP response afterwards."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def getCatalog(issue, maxRetries=3):
    """Crawl the table of contents of one issue and download all its articles.

    Args:
        issue: issue identifier string such as "201501"; its first four
            characters and its last two characters are joined with "_" to
            form the issue directory of the site URL.
        maxRetries: how many times each article download is attempted before
            the article is skipped.

    Returns:
        A dict (OrderedDict, in page order) mapping the ``.string`` of every
        <h2> heading on the catalog page to the list of article dicts returned
        by getEachArticle for the links found under that heading's parent.
    """
    from collections import OrderedDict

    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"

    # The issue's index page links (first <a> inside the first <table>) to the
    # page that carries the actual column/article listing.
    indexSoup = BeautifulSoup(_fetchHtml(url + "index.html"))
    catalogUrl = url + indexSoup.table.a.get("href")
    soup = BeautifulSoup(_fetchHtml(catalogUrl))

    # Ordered so that columns come out in the order they appear on the page.
    duzhe = OrderedDict()
    for heading in soup.find_all("h2"):
        print(heading.string)
        duzhe[heading.string] = list()
        for link in heading.parent.find_all("a"):
            href = url + link.get("href")
            print(href)

            # Retry a bounded number of times: an unbounded retry never ends
            # when a page is permanently broken, and a bare except would also
            # swallow KeyboardInterrupt.
            article = None
            for attempt in range(maxRetries):
                try:
                    article = getEachArticle(href)
                    break
                except Exception as err:
                    print("download failed (%d/%d) for %s: %r"
                          % (attempt + 1, maxRetries, href, err))
            if article is None:
                print("giving up on %s" % href)
                continue
            duzhe[heading.string].append(article)
    return duzhe

def readDuZhe(duzhe):
    """Print the title of every article of every column in *duzhe*.

    *duzhe* maps a column name to a list of article dicts, each of which
    has a "title" entry.
    """
    for articles in duzhe.values():
        for article in articles:
            print(article["title"])

if __name__ == '__main__':
    # Demo run against a fixed issue; to pick the issue interactively use
    # raw_input("issue(201501):") instead of the literal below.
    catalog = getCatalog("201424")
    readDuZhe(catalog)

getpdf.py

代码如下:

#!/usr/bin/env python

#coding=utf-8

"""

Author: Anemone

Filename: getpdf.py

Last modified: 2015-02-20 19:19

E-mail: anemone@82flex.com

"""

#coding=utf-8

import reportlab.rl_config

from reportlab.pdfbase import pdfmetrics

from reportlab.pdfbase.ttfonts import TTFont

from reportlab.lib import fonts

import copy

from reportlab.platypus import Paragraph, SimpleDocTemplate,flowables

from reportlab.lib.styles import getSampleStyleSheet

import crawler

def _makeStyle(base, fontSize, leading, firstLineIndent=None):
    """Return a copy of *base* using the 'song' font and the given metrics."""
    style = copy.deepcopy(base)
    style.fontName = 'song'
    style.fontSize = fontSize
    style.leading = leading
    if firstLineIndent is not None:
        style.firstLineIndent = firstLineIndent
    return style


def writePDF(issue,duzhe):
    """Render one crawled issue into the file "duzhe<issue>.pdf".

    Args:
        issue: issue identifier string; used in the cover line and in the
            output file name.
        duzhe: mapping of column name -> list of article dicts with the keys
            "title", "writer", "from" and "context" (the shape produced by
            crawler.getCatalog).

    The document starts with a cover line and a table of contents (column
    names followed by article titles), then a page break, then every article
    (title, writer/source line, body paragraphs) each followed by a page
    break.

    NOTE(review): the fonts are loaded from "simsun.ttc" and "msyh.ttc";
    these files must be resolvable by reportlab on the machine running this.
    """
    # Paragraph treats its text as markup, so text scraped from the web has
    # to be escaped or a stray "<" / "&" can break the build.
    from xml.sax.saxutils import escape

    reportlab.rl_config.warnOnMissingFontGlyphs = 0
    pdfmetrics.registerFont(TTFont('song',"simsun.ttc"))
    pdfmetrics.registerFont(TTFont('hei',"msyh.ttc"))
    # Family 'song': regular/italic use 'song', bold/bold-italic use 'hei'.
    fonts.addMapping('song', 0, 0, 'song')
    fonts.addMapping('song', 0, 1, 'song')
    fonts.addMapping('song', 1, 0, 'hei')
    fonts.addMapping('song', 1, 1, 'hei')

    base = getSampleStyleSheet()['Normal']
    normalStyle = _makeStyle(base, 11, 11, 20)
    titleStyle = _makeStyle(base, 15, 20)
    firstTitleStyle = _makeStyle(base, 20, 20, 50)
    smallStyle = _makeStyle(base, 8, 8)

    story = []
    story.append(Paragraph(escape("读者{0}期".format(issue)), firstTitleStyle))

    # Table of contents: a rule, the column name, then its article titles.
    for eachColumn in duzhe:
        story.append(Paragraph('__'*28, titleStyle))
        story.append(Paragraph(escape('{0}'.format(eachColumn)), titleStyle))
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(escape(eachArticle["title"]),normalStyle))
    # NOTE(review): placement reconstructed from a source that lost its
    # indentation -- one page break after the whole table of contents.
    story.append(flowables.PageBreak())

    for eachColumn in duzhe:
        for eachArticle in duzhe[eachColumn]:
            story.append(Paragraph(escape("{0}".format(eachArticle["title"])),titleStyle))
            story.append(Paragraph(escape(" {0} {1}".format(eachArticle["writer"],eachArticle["from"])),smallStyle))
            # One Paragraph per non-blank line of the body. Splitting on an
            # empty separator (str.split("")) raises ValueError, so the text
            # is broken on line boundaries instead.
            for eachPara in eachArticle["context"].splitlines():
                if eachPara.strip():
                    story.append(Paragraph(escape(eachPara),normalStyle))
            # Every article ends its page.
            story.append(flowables.PageBreak())

    doc = SimpleDocTemplate("duzhe"+issue+".pdf")
    print("Writing PDF...")
    doc.build(story)

def main(issue):
    """Crawl the given issue with the crawler module and write it as a PDF."""
    writePDF(issue, crawler.getCatalog(issue))

if __name__ == '__main__':
    # Ask for the issue identifier (e.g. 201501) and build its PDF.
    main(raw_input("Enter issue(201501):"))

以上就是本文的全部内容了，希望大家能够喜欢。