新手区用 python 写的一个爬虫工具

xuxiujin · 2016年08月31日 · 最后由 gang 回复于 2016年09月11日 · 4796 次阅读

python 学习

主要用到的模块：Tkinter、tkFileDialog、urllib2、re

 search_html.py
# encoding: utf-8
import urllib,urllib2,requests,re
import sys
reload(sys)
sys.setdefaultencoding('utf8')

#抓取盗墓笔记小说
def gethtml(url):
    """Download ``url`` and return the response body exactly as read.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``read()`` on the opened response yields; no decoding or
        parsing is performed here.

    Raises:
        urllib2.URLError: Propagated from ``urllib2.urlopen`` if the request
            cannot be completed.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Always release the connection, even when read() fails.
        response.close()

def get_content(html):
    """Extract the paragraph text of a page and save it to ``<title>.txt``.

    The text between every ``<p>`` and ``</p>`` is collected, any HTML tags
    left inside it are removed, and each paragraph is written (stripped of
    surrounding whitespace) on its own line to a file named after the page
    title returned by ``get_title(html)``. The file name is relative, so the
    file is created in the current working directory.

    Args:
        html: Page source, e.g. the value returned by ``gethtml``.

    Returns:
        All tag-free paragraphs concatenated into a single string, without
        separators and without the per-line stripping applied to the file.
    """
    title = get_title(html)
    paragraph_re = re.compile(r'<p>(.*?)</p>')
    tag_re = re.compile(r'</?\w+[^>]*>')  # any remaining HTML tag

    paragraphs = [tag_re.sub('', raw) for raw in paragraph_re.findall(html)]

    # Write one paragraph at a time; the original author noted that writing
    # the text any other way produced garbled characters.
    with open(u'%s.txt' % title, 'wb') as fp:
        for paragraph in paragraphs:
            fp.write(paragraph.strip() + '\n')

    return ''.join(paragraphs)

def get_title(html):
    """Return the page title from ``html`` in a form usable as a file name.

    The title is the text of the first ``<h1>...</h1>`` pair found on a single
    line of ``html``. Backslashes, slashes, colons, asterisks, question marks,
    double quotes, angle brackets, vertical bars and spaces are removed so the
    result can be used when building a file name.

    Args:
        html: Page source to search.

    Returns:
        The cleaned title string.

    Raises:
        ValueError: If ``html`` contains no ``<h1>...</h1>`` pair.
    """
    match = re.search(r'<h1>(.*?)</h1>', html)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('no <h1> title found in page')
    # One pass over the title instead of ten chained replace() calls.
    return re.sub(r'[\\/:*?"<>| ]', '', match.group(1))

if __name__=='__main__':
    url='http://www.daomubiji.com/qi-xing-lu-wang-'#盗墓笔记网页，通过拼接的方式获取网页元素
    url2='http://www.idaomu.com/qin-ling-shen-shu-'
    for i in range(1,41):
        j=str(i)
        html=url2+j.zfill(2)+'.html'#自动把一位数变成两位数
        print html
        html_content=gethtml(html)
        get_content( html_content)


TK_HTML.py
# encoding: utf-8
import Tkinter,tkFileDialog
import search_html
class main_frm:
    """Tkinter window that displays a chapter text file.

    Constructing an instance builds the window, immediately asks the user to
    choose a text file to display, and then runs the Tk main loop, so the
    constructor does not return until that loop ends.
    """

    def __init__(self,CONTENT,TITLE_MENU,filepath):
        """Create the widgets, load the first file and enter the main loop.

        Args:
            CONTENT: Forwarded to ``main_top``, which currently ignores it.
            TITLE_MENU: Text shown as the only entry of the option menu.
            filepath: Accepted for backward compatibility; not used.
        """
        self.root=Tkinter.Tk()
        # Options shared by the open and save-as file dialogs.
        self.file_opt = options = {}
        options['defaultextension'] = '.txt'
        options['filetypes'] = [('all files', '.*'), ('text files', '.txt')]
        # Raw string: in an ordinary literal the '\a' in this path is the BEL
        # control character, which silently corrupted the directory name.
        options['initialdir'] = r'C:\Users\DELL\Desktop\auto_android_v365\daomubiji'
        options['initialfile'] = 'myfile.txt'
        options['parent'] = self.root
        options['title'] = 'This is a title'
        self.root.title(u'盗墓笔记')  # window title
        self.root.geometry('500x700')  # initial window size
        # Both width and height are resizable.
        self.root.resizable(width=True,height=True)
        self.frm=Tkinter.Frame(self.root)
        self.Variable=Tkinter.StringVar(self.root)
        self.text=Tkinter.Text(self.root)
        # "Read next chapter" button: pick a file and show it in the text box.
        self.list_button=Tkinter.Button(self.frm,text="阅读下一章",command=self._show_file)
        # "Save as" button. The right-hand side is evaluated first, so
        # `command` receives the bound save_button method before the instance
        # attribute of the same name is rebound to the widget.
        self.save_button=Tkinter.Button(self.frm,text='另存为',command=self.save_button)
        # Quit button: stops the Tk main loop.
        self.quit=Tkinter.Button(self.root,text='退出阅读',command=self.root.quit,bg='red',fg='white')
        self.main_top(CONTENT,TITLE_MENU)
        self.main_loop()
        self.root.mainloop()

    def _show_lines(self, lines):
        """Replace the text widget's content with ``lines``, in order."""
        self.text.delete('1.0', Tkinter.END)
        # Insert at the end: inserting every line at index 1.0 put each new
        # line before the previous one and displayed the file backwards.
        self.text.insert(Tkinter.END, ''.join(lines))

    def _show_file(self):
        """Button callback: let the user pick a file and display it."""
        lines = self.openfile()
        if lines:
            # Keep the current text when the dialog is cancelled or the file
            # is empty.
            self._show_lines(lines)

    def main_top(self,content,title_menu):
        """Create the title menu and show a user-selected file.

        Args:
            content: Not used; the displayed text always comes from the file
                chosen in the open dialog.
                NOTE(review): confirm whether the passed-in content was meant
                to be displayed instead.
            title_menu: Initial value and only entry of the option menu.
        """
        self.Variable.set(title_menu)
        self.menu=Tkinter.OptionMenu(self.frm,self.Variable,title_menu)
        self._show_lines(self.openfile())

    def main_loop(self):
        """Lay out all widgets inside the window."""
        self.frm.pack()
        self.menu.pack(fill =Tkinter.Y,expand = 1,side = Tkinter.LEFT)
        self.list_button.pack(fill =Tkinter.BOTH,expand = 1,side = Tkinter.RIGHT)
        self.save_button.pack(fill =Tkinter.BOTH,expand = 1,side = Tkinter.RIGHT)
        self.text.pack(fill =Tkinter.BOTH,expand = 1)
        self.quit.pack(side=Tkinter.BOTTOM)

    def openfile(self):
        """Ask the user for a file and return its lines.

        The chosen path is stored in ``self.filename``.

        Returns:
            The list produced by ``readlines()`` on the chosen file, or an
            empty list when the dialog is dismissed without a selection.
        """
        self.filename=tkFileDialog.askopenfilename(**self.file_opt)
        if not self.filename:
            # Dialog cancelled: there is nothing to open.
            return []
        with open(self.filename) as in_file:
            return in_file.readlines()

    def save_button(self):
        """Ask for a target file and write the displayed text to it.

        Returns:
            The file name returned by the save-as dialog (empty when the
            dialog is cancelled, in which case nothing is written).
        """
        filename = tkFileDialog.asksaveasfilename(**self.file_opt)
        if filename:
            # 'end-1c' drops the trailing newline Tk keeps after the content.
            text = self.text.get('1.0', 'end-1c')
            if not isinstance(text, bytes):
                # Encode as UTF-8, the encoding declared for these scripts.
                text = text.encode('utf-8')
            with open(filename, 'wb') as out_file:
                out_file.write(text)
        return filename

if __name__=='__main__':
    # Local directory holding the saved chapter text files.
    # Raw string: in an ordinary literal the '\a' in this path is the BEL
    # control character, which silently corrupted the directory name.
    filepath=r'C:\Users\DELL\Desktop\auto_android_v365\daomubiji'
    # Prefix of the chapter pages that are fetched directly from the web.
    url1='http://www.idaomu.com/qin-ling-shen-shu-'
    main=main_frm
    for i in range(1,2):
        # Pad single-digit chapter numbers to two digits.
        url=url1+str(i).zfill(2)+'.html'
        html=search_html.gethtml(url)
        content=search_html.get_content(html)
        title_menu=search_html.get_title(html)
        # Opens the reader window; returns once its main loop ends.
        main(content,title_menu,filepath)

最近在学习 Python，所以想写一个工具来爬取小说《盗墓笔记》。此话题仅限个人学习，不知能不能通过审核，我想以此来记录我的学习过程。

共收到 2 条回复 · 时间 · 点赞

jack2795 #1 · 2016年08月31日

go on！

gang #1 · 2016年09月11日

感觉没啥说的。。。 BeautifulSoup 吧，当然 re 也没什么不好
溜~

需要登录后方可回复, 如果你还没有账号请点击这里注册。

新手区 用 python 写的一个爬虫工具

新手区 用 python 写的一个爬虫工具

python 学习

新手区用 python 写的一个爬虫工具

新手区用 python 写的一个爬虫工具