python实现爬虫下载漫画示例_python教程-查字典教程网

代码如下:

#!/usr/bin/python3.2

import os,socket

import urllib

import urllib.request,threading,time

import re,sys

# Module-level settings, filled in from the command line:
#     downloadmanhua weburl floder [chapterbegin] [threadcount]
# The names (including the spelling "floder") are read as globals by the
# functions below, so they are kept unchanged.
weburl = ''            # page that is fetched and scanned for chapter links
floder = ''            # prefix that each chapter name is appended to
chapterbegin = 0       # index of the first chapter to download
currentthreadnum = 0   # number of download threads currently counted as busy
threadcount = 6        # upper bound for currentthreadnum

if len(sys.argv) >= 3:
    weburl = sys.argv[1]
    floder = sys.argv[2]
else:
    print("usage: downloadmanhua weburl floder chapterbegin=0 threadnum=6")
    # A usage error is a failure, so report it with a non-zero status.
    sys.exit(1)

if len(sys.argv) >= 4:
    chapterbegin = int(sys.argv[3])

if len(sys.argv) >= 5:
    threadcount = int(sys.argv[4])
    if threadcount < 1:
        # With no worker slots the scheduler loop would wait forever.
        print("threadnum must be at least 1")
        sys.exit(1)

def jin(i, jinzhi):
    """Return the non-negative integer *i* written in base *jinzhi*.

    Digit values 0-9 are written as decimal digits; values from 10 upward
    are written as 'a', 'b', 'c', ... in order.

    Args:
        i: Non-negative integer to convert.
        jinzhi: Target base, at least 2.

    Returns:
        The digits as a string, most significant digit first.

    Raises:
        ValueError: If *i* is negative or *jinzhi* is smaller than 2.
    """
    if jinzhi < 2:
        raise ValueError("base must be at least 2, got %r" % (jinzhi,))
    if i < 0:
        raise ValueError("cannot convert negative number %r" % (i,))

    digits = []
    while True:
        # divmod stays in integer arithmetic; float division would round
        # large values and corrupt the high-order digits.
        i, remainder = divmod(i, jinzhi)
        if remainder > 9:
            digits.append(chr(ord('a') + remainder - 10))
        else:
            digits.append(str(remainder))
        if i == 0:
            break
    # Digits were produced least significant first.
    return ''.join(reversed(digits))

def urlparse(p, a, c, k):
    """Expand the packed text *p* by substituting single-character tokens.

    A lookup table is built that maps the base-36 spelling of every index
    below *c* to the corresponding entry of *k* (or to the token itself when
    that entry is empty or missing). Each character of *p* in ``0-9`` or
    ``a-f`` is then replaced by its table entry; every other character is
    copied unchanged.

    NOTE(review): the parameter names and the ``eval`` script this is fed
    from suggest a reduced port of the JavaScript p,a,c,k unpacker - confirm.
    Only single characters ``0-9a-f`` are substituted; longer tokens and the
    letters ``g-z`` are deliberately left as they are, as before.

    Args:
        p: Packed text.
        a: Accepted for signature compatibility but not used; tokens are
            always generated in base 36.
        c: Number of table entries to build.
        k: List of replacement words, indexed by token value.

    Returns:
        The expanded text.
    """
    table = {}
    for index in range(c):
        token = jin(index, 36)
        # A short word list is treated like an empty entry instead of
        # raising IndexError.
        word = k[index] if index < len(k) else ''
        table[token] = word or token

    pieces = []
    for ch in p:
        if ch in '0123456789abcdef':
            # Characters with no table entry (c < 16) pass through
            # unchanged rather than raising KeyError.
            pieces.append(table.get(ch, ch))
        else:
            pieces.append(ch)
    return ''.join(pieces)

def meispower(s):
    """Extract the ``imgpath`` value from a packed script line.

    The text from the first "}(" onwards is taken as the argument list,
    its last 19 characters are dropped, and the remainder is split into
    four comma-separated fields: packed text, radix, word count and a
    '|'-separated word list. Those are expanded with ``urlparse`` and the
    first ``imgpath=...`` assignment (up to the next ';') is located in
    the result.

    NOTE(review): the fixed offsets (19 trailing characters, 10 leading and
    2 trailing characters around the value) are inherited unchanged;
    presumably they match ``'.split('|'),0,{}))`` and escaped quotes around
    the value - confirm against a live page.

    Args:
        s: Script text containing the packed call.

    Returns:
        The ``imgpath`` value with its surrounding characters removed.

    Raises:
        ValueError: If the argument list or the ``imgpath`` assignment
            cannot be found.
    """
    # The escapes are required: an unescaped "(" inside the lookahead
    # leaves the group unbalanced and the pattern does not compile.
    found = re.findall(r"(?=\}\().*", s)
    if not found:
        raise ValueError("no '}(' argument list found in script")
    arguments = found[0][:-19]

    # Split from the right so commas inside the packed text cannot shift
    # the radix, count and word-list fields.
    fields = arguments.rsplit(',', 3)
    if len(fields) != 4:
        raise ValueError("unexpected argument list: %r" % (arguments,))
    packed, radix, count, words = fields
    words = words[1:]  # drop the quote that opens the word list

    chapterpath = urlparse(packed, int(radix), int(count), words.split('|'))
    hits = re.findall('imgpath=[^;]*', chapterpath)
    if not hits:
        raise ValueError("no imgpath assignment found in unpacked script")
    return hits[0][10:-2]

def pictofile(weburl, filename, loop=100):
    """Download *weburl* into *filename*, retrying on failure.

    Nothing is fetched when *filename* already exists. A response shorter
    than 2048 bytes counts as a failed attempt. At most ``loop + 1``
    attempts are made; when they are used up (or *loop* is negative) a
    failure message is printed and the function gives up.

    Args:
        weburl: Address of the picture.
        filename: Path of the file to write.
        loop: Number of retries allowed after the first attempt.

    Returns:
        None.
    """
    # An explicit loop replaces the former self-recursion, which nested one
    # stack frame (and one open response) per retry.
    for _ in range(loop + 1):
        if os.path.exists(filename):
            return
        try:
            # The context managers close the response and the file even
            # when reading or writing raises.
            with urllib.request.urlopen(weburl) as response:
                data = response.read()
            if len(data) < 2048:
                continue
            print('download from %s name is %s\n' % (weburl, filename))
            with open(filename, 'wb') as myfile:
                myfile.write(data)
            return
        except socket.timeout:
            print('timeout')
        except Exception as e:
            # Best effort: any other failure is reported and retried.
            print('error', e)
    print("can't download the picture %s" % weburl)

def downloadpic(url, loadpicdir, num):
    """Thread worker: store the picture at *url* inside *loadpicdir*.

    The directory is created when missing. The file is named
    ``<num>-manhua<tail>``, where ``<tail>`` is the run of characters from
    ``0-9a-z.`` at the very end of *url*. The global busy-thread counter is
    decremented when the worker finishes, whether or not it succeeded.

    Args:
        url: Address of the picture.
        loadpicdir: Directory the picture is written to.
        num: Number used as the file name prefix.

    Returns:
        None.
    """
    global currentthreadnum
    try:
        try:
            # mutex2 serialises directory creation between workers. The
            # working directory is no longer changed: chdir is process-wide
            # and made the relative path built below point to the wrong place.
            with mutex2:
                os.makedirs(loadpicdir, exist_ok=True)
        except OSError:
            print("can't create floder %s" % loadpicdir)
            return
        # \Z anchors the match at the end of the URL, so the first hit is
        # the trailing file name.
        name = re.findall(r'[0-9a-z.]*\Z', url)
        filename = 'manhua' + name[0]
        pictofile(url, os.path.join(loadpicdir, str(num) + '-' + filename))
    finally:
        # Always give the slot back; otherwise the scheduler, which waits
        # for the counter to drop, could stall forever.
        with mutex:
            currentthreadnum = currentthreadnum - 1

def downloadchapter(url, loadpicdir, num, begin=0):
    """Start one download thread per picture of a chapter.

    The chapter page at ``manhuaweb + url`` is fetched. The chapter name is
    the text after ``<title>`` up to the first underscore, and the picture
    address prefix is obtained by passing the first ``eval...`` script match
    to ``meispower``. Pictures ``begin`` .. ``num - 1`` are then handed to
    ``downloadpic`` threads, never letting the busy counter exceed
    ``threadcount``. The threads are not joined here.

    Args:
        url: Chapter path, appended to ``manhuaweb``.
        loadpicdir: Prefix the chapter name is appended to.
        num: Exclusive upper bound of the picture indices; also passed to
            every worker as its ``num`` argument.
        begin: First picture index.

    Raises:
        ValueError: If the title or the script cannot be found in the page.
    """
    global currentthreadnum
    print(manhuaweb + url)
    with urllib.request.urlopen(manhuaweb + url) as response:
        webdata = response.read().decode('UTF-8')

    titles = re.findall(r'<title>[^_]*', webdata)
    scripts = re.findall(r'eval.*[^<>]', webdata)
    if not titles or not scripts:
        raise ValueError('chapter page %s has no title or packed script' % url)
    chaptername = titles[0][7:]  # drop the 7 characters of '<title>'
    chapterurl = 'http://mhimg.ali213.net' + meispower(scripts[0])

    for i in range(begin, num):
        # Throttle: wait until a worker slot is free.
        while currentthreadnum >= threadcount:
            time.sleep(0.5)
        with mutex:
            currentthreadnum = currentthreadnum + 1
        try:
            worker = threading.Thread(
                target=downloadpic,
                args=(r'%s%d.jpg' % (chapterurl, i), loadpicdir + chaptername, num),
            )
            worker.start()
        except socket.error as error:
            # The worker never ran, so hand its slot back and move on to the
            # next picture (decrementing the loop variable never retried it).
            with mutex:
                currentthreadnum = currentthreadnum - 1
            print('skip picture %d:' % i, error)
        except Exception as error:
            # Hand the slot back here too, so a later chapter is not starved.
            with mutex:
                currentthreadnum = currentthreadnum - 1
            print(error, 'break')
            print('download chapter %d of picture make a error' % i)
            break

if __name__ == '__main__':
    # Globals read by downloadchapter / downloadpic.
    manhuaweb = r'http://manhua.ali213.net'
    socket.setdefaulttimeout(60.0)
    mutex = threading.Lock()    # guards currentthreadnum
    mutex2 = threading.Lock()   # guards directory creation

    with urllib.request.urlopen(weburl) as webfile:
        webdata = webfile.read().decode('UTF-8')

    meshmatches = re.findall(r'<div>.*</div>', webdata)
    if not meshmatches:
        print("can't find the chapter list in %s" % weburl)
        sys.exit(1)
    meshdata = meshmatches[0]

    # NOTE(review): the source text may have lost backslashes here, so this
    # could originally have matched literal parentheses - confirm on a live
    # page. The pattern is kept as found.
    indexdata = re.findall(r'([0-9]*页)', meshdata)
    picurldata = re.findall(r'/comic/[0-9/]*\.html', meshdata)
    chapterlength = len(picurldata)
    if len(indexdata) < chapterlength:
        print("found %d chapter links but only %d page counts"
              % (chapterlength, len(indexdata)))
        sys.exit(1)

    # \d is required: the un-escaped form only matched the letter "d", so no
    # page count was ever found.
    nummode = re.compile(r'\d+')
    for i in range(chapterbegin, chapterlength):
        # Entries are consumed from the end of the lists towards the start.
        position = chapterlength - i - 1
        manhuachapter = picurldata[position]
        pagecount = nummode.findall(indexdata[position])
        if not pagecount:
            print("no page count for chapter %s, skipped" % manhuachapter)
            continue
        downloadchapter(manhuachapter, floder, int(pagecount[0]))