前面恒温发了个帖子《计算机开放电子书汇总》,于是就有了后面的故事……
自娱自乐,写了个脚本,爬了几本书。 自己都觉得写得比较渣,很 low.
被安吉利拉看到了,说让我共享出来,造福大众。
挣扎了很久,还是发出来,灌下水。。。
先看下效果:
运行情况:
运行结果:
代码献上:
__author__ = 'anderson'
#coding=utf-8
import urllib
import re
import os
def getHtml(url):
    """Fetch *url* and return the response body.

    Uses the Python 2 ``urllib.urlopen`` API, like the rest of this script.

    :param url: address of the page to download.
    :return: whatever ``read()`` yields for the opened page.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()
def getlink(html, type):
    """Print every book link found in *html* and download the matching ones.

    A link is any ``<li> <a href="..." target="_blank">label</a>`` fragment.
    Each link is printed as ``href label``. A link is downloaded with
    ``curl -O`` when its label is not "在线阅读" and contains *type*.

    :param html: page source to scan.
    :param type: substring that a label must contain (e.g. ``'PDF'``).
        An empty value selects nothing; previously it matched every label
        and downloaded every file. The name shadows the builtin but is kept
        for existing callers.
    :return: None.
    """
    # Local import so this function works without touching the file's imports.
    import subprocess

    link_re = re.compile('<li> <a href="(.*?)" target="_blank">(.*?)</a>', re.S)
    for href, label in link_re.findall(html):
        print("%s %s" % (href, label))
        if not type or label == r"在线阅读":
            continue
        if type in label:
            # Pass an argument list instead of a shell string: the URL comes
            # from a scraped page and may contain '&', ';' or other
            # characters the shell would interpret.
            subprocess.call(["curl", "-O", href])
# Menu shown to the user. Kept out of the name ``type`` so the builtin of
# that name is not shadowed at module level.
type_menu = """
1.pdf
2.epub
3.mobi"""
print(type_menu)

# Maps the menu number the user types to the label text used on the page.
format_by_choice = {'1': 'PDF', '2': 'EPUB', '3': 'MOBI'}

yourtype = raw_input("please input your type: ").strip()
downtype = format_by_choice.get(yourtype, "")

if downtype:
    # Fetch and download only after a valid choice. An empty ``downtype``
    # would match every link label and download every file.
    html = getHtml("https://testerhome.com/topics/4419")
    getlink(html, downtype)
elif yourtype:
    print("please check your choice!")
else:
    print("please choose one type!")
不足之处:
标注为“下载地址”的链接指向各不相同的网站,正则难以统一匹配,暂时没有处理。
纯属自娱自乐,欢迎拍砖。
根据大家的意见,对脚本做了一些改进。
以下是改进后的代码:
__author__ = 'anderson'
#coding=utf-8
import urllib
import re
import os
import threading,time
from time import sleep, ctime
class get_book(object):
    """Download the books of one format linked from a topic page.

    Attributes set by the constructor:
        types -- label substring selecting which links to download.
        nsec  -- seconds to sleep after each download.
        url   -- address of the topic page.
    """

    def __init__(self, nsec, url, types):
        self.types = types
        self.nsec = nsec
        self.url = url

    def now(self):
        """Return the current local time as ``YYYY-mm-dd HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    def getHtml(self, url):
        """Fetch *url* and return the response body.

        :param url: page address; a falsy value falls back to ``self.url``.
            The argument used to be ignored in favour of ``self.url``.
        :return: whatever ``read()`` yields for the opened page.
        """
        page = urllib.urlopen(url or self.url)
        try:
            return page.read()
        finally:
            # Release the connection even when read() raises.
            page.close()

    def getlink(self, html, types):
        """Print every link found in *html* and download the matching ones.

        A link is any ``<li> <a href="..." target="_blank">label</a>``
        fragment. Each link is printed as ``href label``; a link whose label
        contains the wanted format is downloaded with ``curl -O``, followed
        by a pause of ``self.nsec`` seconds.

        :param html: page source to scan.
        :param types: label substring to download; a falsy value falls back
            to ``self.types``. The argument used to be ignored.
        :return: None.
        """
        # Local import so the class works without touching the file's imports.
        import subprocess

        wanted = types or self.types
        link_re = re.compile('<li> <a href="(.*?)" target="_blank">(.*?)</a>', re.S)
        for href, label in link_re.findall(html):
            print("%s %s" % (href, label))
            # An empty filter would match every label, so it selects nothing.
            if wanted and wanted in label:
                # Argument list, not a shell string: the URL is scraped text
                # and may contain characters the shell would interpret.
                subprocess.call(["curl", "-O", href])
                time.sleep(self.nsec)
def main():
    """Ask for a book format and download every matching book.

    Prints the available choices, reads a menu number from stdin, then
    fetches the topic page and passes it to ``get_book.getlink``. Prints
    ``please select again`` and returns when the input is not one of the
    menu numbers.
    """
    types = {1: "PDF", 2: "EPUB", 3: "MOBI"}
    print(types.items())
    topic_url = "https://testerhome.com/topics/4419"
    yourtype = raw_input("please input your type: ")

    try:
        choice = int(yourtype)
    except ValueError:
        # Non-numeric input is treated like any other invalid choice.
        choice = None

    # Check against the menu itself: the old ``range(1, 3)`` test rejected
    # choice 3 (MOBI).
    if choice not in types:
        print("please select again")
        return

    book_type = types[choice]
    print(book_type)
    get = get_book(2, topic_url, book_type)
    page = get.getHtml(topic_url)
    print('starting at: %s' % get.now())
    # Run a single pass. The previous ten threads each ran the same full
    # download, so every file was fetched ten times concurrently into the
    # same output name.
    get.getlink(page, book_type)
    print('all Done at: %s' % get.now())


if __name__ == '__main__':
    main()