灌水爬了几本书

snake · 2016年03月19日 · 最后由 xiangtao8112 回复于 2016年03月31日 · 6037 次阅读

前面恒温发了个帖子《计算机开放电子书汇总》，于是就有了后面的故事……

自娱自乐，写了个脚本，爬了几本书。自己都觉得写得比较渣，很 low.
被安吉利拉看到了，说让我共享出来，造福大众。

挣扎了很久，还是发出来，灌下水。。。

先看下效果：
运行情况：

运行结果：

代码献上：

__author__ = 'anderson'


#coding=utf-8
import urllib
import re
import os

def getHtml(url):
    """Fetch *url* and return the response body exactly as read.

    No decoding or parsing is done here; the caller receives whatever
    ``read()`` returns.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        # Release the connection even when read() raises.
        page.close()

def getlink(html,type):
    """Print every book link found in *html* and download the matching ones.

    A link is handed to ``curl -O`` when its label contains *type* and is
    not the "read online" entry.  An empty *type* is a substring of every
    label, so it selects every link except "read online".

    The parameter name ``type`` shadows the builtin; it is kept so callers
    that pass it by keyword keep working.
    """
    # Imported locally so the function does not rely on a file-level import.
    import subprocess

    # Group 1 is the href, group 2 the visible link label.
    link_re = re.compile('<li>  <a href="(.*?)" target="_blank">(.*?)</a>', re.S)

    for href, label in link_re.findall(html):
        print("%s %s" % (href, label))
        if label == r"在线阅读" or type not in label:
            continue
        try:
            # Argument list instead of a shell string: the href comes from
            # the downloaded page and must never be interpreted by a shell.
            subprocess.call(["curl", "-O", href])
        except OSError as exc:
            # curl could not be started; later links would fail the same way.
            print("could not run curl: %s" % exc)
            return

# Menu shown to the user; the number typed selects the format to download.
# Named ``menu`` rather than ``type`` so the builtin is not shadowed.
menu = """
    1.pdf
    2.epub
    3.mobi"""

print(menu)

# Maps the menu number to the label text that getlink() searches for.
formats = {'1': 'PDF', '2': 'EPUB', '3': 'MOBI'}

yourtype = raw_input("please input your type: ")

downtype = formats.get(yourtype, "")
if yourtype == '':
    print("please choose one type!")
elif not downtype:
    print("please check your choice!")
else:
    # Fetch and download only after a valid choice: an empty downtype is a
    # substring of every link label and would download every format.
    html = getHtml("https://testerhome.com/topics/4419")
    getlink(html,downtype)

不足之处：
有些条目的链接文字是“下载地址”，指向各不相同的网站，很难用同一个正则匹配，所以暂时没有处理。

纯属自娱自乐，欢迎拍砖。

根据大家的意见，改进了一下，
以下是改进后的：

__author__ = 'anderson'


#coding=utf-8
import urllib
import re
import os
import threading,time
from time import sleep, ctime

class get_book():
    """Find book links on a page and download those of one format.

    Attributes (all set from the constructor arguments):
        nsec  -- seconds to sleep after each download
        url   -- page fetched by getHtml() when no URL is passed
        types -- format label used by find_links()/getlink() when none is
                 passed
    """

    # Matches one list entry; group 1 is the href, group 2 the link label.
    _LINK_RE = re.compile('<li>  <a href="(.*?)" target="_blank">(.*?)</a>', re.S)

    def __init__(self,nsec, url, types) :
        self.types = types
        self.nsec = nsec
        self.url = url

    def now(self) :
        """Return the local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

    def getHtml(self,url=None):
        """Fetch *url* (``self.url`` when omitted) and return the body."""
        page = urllib.urlopen(url or self.url)
        try:
            return page.read()
        finally:
            # Release the connection even when read() raises.
            page.close()

    def find_links(self,html,types=None):
        """Print every link found in *html*; return the hrefs to download.

        An href is returned when its label contains *types* (``self.types``
        when omitted).
        """
        wanted = types or self.types
        selected = []
        for href, label in self._LINK_RE.findall(html):
            print("%s %s" % (href, label))
            if wanted in label:
                selected.append(href)
        return selected

    def download(self,links):
        """Run ``curl -O`` on each URL in *links*, sleeping nsec after each."""
        # Imported locally so the class does not rely on a file-level import.
        import subprocess

        for href in links:
            try:
                # Argument list instead of a shell string: the href comes
                # from the downloaded page and must not reach a shell.
                subprocess.call(["curl", "-O", href])
            except OSError as exc:
                # curl could not be started; later links would fail too.
                print("could not run curl: %s" % exc)
                return
            sleep(self.nsec)

    def getlink(self,html,types=None):
        """Sequentially download every link in *html* matching *types*."""
        self.download(self.find_links(html, types))


def main():
    """Ask for a format, then download all matching books in parallel."""
    types = {1:"PDF",2:"EPUB",3:"MOBI"}

    print(sorted(types.items()))

    page_url = "https://testerhome.com/topics/4419"

    yourtype = raw_input("please input your type: ")

    # Non-numeric input is treated like any other invalid choice.
    try:
        choice = int(yourtype)
    except ValueError:
        choice = None

    # Test membership in the table itself: range(1,3) excluded 3 (MOBI).
    if choice not in types:
        print("please select again")
        return

    wanted = types[choice]
    print(wanted)

    get = get_book(2, page_url, wanted)
    links = get.find_links(get.getHtml(page_url), wanted)

    print('starting at: %s' % get.now())

    # Deal the links out round-robin so each file is fetched by exactly one
    # thread; running the full list in every thread would download each
    # file once per thread, all writing to the same output file.
    workers = min(10, len(links))
    threadpool = [
        threading.Thread(target=get.download, args=(links[i::workers],))
        for i in range(workers)
    ]

    for th in threadpool:
        th.start()

    for th in threadpool:
        th.join()

    print('all Done at: %s' % get.now())


if __name__ == '__main__':
    main()