主要用到的模块:Tkinter、tkFileDialog、urllib2、re(基于 Python 2)。
search_html.py
# encoding: utf-8
import urllib,urllib2,requests,re
import sys
# Python 2 idiom: reload the sys module and then switch the interpreter-wide
# default string encoding to UTF-8.
# NOTE(review): presumably needed so implicit str/unicode conversions of the
# Chinese text (e.g. when building the output file name) do not fail -- confirm.
reload(sys)
sys.setdefaultencoding('utf8')
# Scrape the "Daomu Biji" novel.
def gethtml(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body exactly as read from the server (not decoded).

    Raises:
        Whatever ``urllib2.urlopen`` raises when the request fails; nothing
        is caught here.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Close explicitly (try/finally rather than ``with``) so the
        # connection is released even when read() raises.
        response.close()
def get_content(html):
    """Extract the chapter text from *html* and save it as ``<title>.txt``.

    The body of every ``<p>...</p>`` element is treated as one paragraph.
    Remaining HTML tags inside a paragraph are removed, and the paragraphs
    are written one per line to a file named after the chapter title (as
    returned by ``get_title``), relative to the current working directory.

    Args:
        html: Page source, e.g. as returned by ``gethtml``.

    Returns:
        All tag-stripped paragraphs concatenated without any separator.
    """
    title = get_title(html)
    # Paragraph bodies. '.' does not match newlines here, so only <p>
    # elements that open and close on the same line are picked up.
    paragraph_re = re.compile('<p>(.*?)</p>')
    # Any remaining HTML tag (links, images, <br>, ...).
    tag_re = re.compile(r'</?\w+[^>]*>')
    paragraphs = [tag_re.sub('', raw) for raw in paragraph_re.findall(html)]
    # Written paragraph by paragraph in binary mode; per the original author,
    # writing the text any other way produced garbled characters.
    with open(u'%s.txt' % title, 'wb') as fp:
        for paragraph in paragraphs:
            fp.write(paragraph.strip() + '\n')
    return ''.join(paragraphs)
# First <h1>...</h1> element of the page holds the chapter title.
_TITLE_RE = re.compile('<h1>(.*?)</h1>')
# Special characters (and the space) stripped from a title so that it can be
# used as a file name: \ / : * ? " > < | and ' '.
_TITLE_STRIP_RE = re.compile(r'[\\/:*?"<>| ]')


def get_title(html):
    """Return the chapter title found in *html*, cleaned for use as a file name.

    Args:
        html: Page source to search.

    Returns:
        The text of the first ``<h1>...</h1>`` element with the characters
        ``\\ / : * ? " > < |`` and spaces removed.

    Raises:
        ValueError: if *html* contains no ``<h1>...</h1>`` element.
    """
    match = _TITLE_RE.search(html)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('no <h1>...</h1> title found in page')
    return _TITLE_STRIP_RE.sub('', match.group(1))
if __name__ == '__main__':
    # Chapter pages are addressed by appending a two-digit chapter number
    # and ".html" to a base URL.
    url = 'http://www.daomubiji.com/qi-xing-lu-wang-'  # not used below
    url2 = 'http://www.idaomu.com/qin-ling-shen-shu-'
    for chapter in range(1, 41):
        # %02d pads single-digit chapter numbers to two digits.
        page_url = '%s%02d.html' % (url2, chapter)
        print(page_url)
        # Download the page, then extract and save its text.
        get_content(gethtml(page_url))
TK_HTML.py
# encoding: utf-8
import Tkinter,tkFileDialog
import search_html
class main_frm:
    """Minimal Tkinter reader window for chapter text files.

    Constructing an instance builds the window, asks the user for a text
    file to display and then runs the Tk main loop, so the constructor only
    returns after that loop has ended.

    The "save as" Button widget is stored as ``save_as_button`` so that it
    no longer shadows the ``save_button`` method.
    """

    def __init__(self, CONTENT, TITLE_MENU, filepath):
        """Build the window, load the first chapter and run the main loop.

        Args:
            CONTENT: Chapter text shown when the initial file dialog is
                cancelled.
            TITLE_MENU: Chapter title shown in the option menu.
            filepath: Directory in which the file dialogs start.
        """
        self.root = Tkinter.Tk()
        # Options shared by the "open" and "save as" file dialogs.
        self.file_opt = {
            'defaultextension': '.txt',
            'filetypes': [('all files', '.*'), ('text files', '.txt')],
            # Start in the directory supplied by the caller instead of a
            # hard-coded path.
            'initialdir': filepath,
            'initialfile': 'myfile.txt',
            'parent': self.root,
            'title': 'This is a title',
        }
        self.root.title(u'盗墓笔记')  # window title
        self.root.geometry('500x700')  # initial window size
        # Width and height are both resizable.
        self.root.resizable(width=True, height=True)
        self.frm = Tkinter.Frame(self.root)
        self.Variable = Tkinter.StringVar(self.root)
        self.text = Tkinter.Text(self.root)
        # "Read next chapter" button: pick a file and show it.
        self.list_button = Tkinter.Button(
            self.frm, text="阅读下一章", command=self._show_chosen_file)
        # "Save as" button.
        self.save_as_button = Tkinter.Button(
            self.frm, text='另存为', command=self.save_button)
        # "Quit reading" button.
        self.quit = Tkinter.Button(
            self.root, text='退出阅读', command=self.root.quit,
            bg='red', fg='white')
        self.main_top(CONTENT, TITLE_MENU)
        self.main_loop()
        self.root.mainloop()

    def main_top(self, content, title_menu):
        """Create the title menu and fill the text area.

        The user is first asked to pick a file; *content* is displayed only
        when no file was chosen.

        Args:
            content: Fallback text (a string or a sequence of lines).
            title_menu: Title used as the option menu's value.
        """
        self.Variable.set(title_menu)
        self.menu = Tkinter.OptionMenu(self.frm, self.Variable, title_menu)
        lines = self.openfile()
        self._display(lines if self.filename else content)

    def main_loop(self):
        """Lay out all widgets (despite its name this does not run the Tk loop)."""
        self.frm.pack()
        self.menu.pack(fill=Tkinter.Y, expand=1, side=Tkinter.LEFT)
        self.list_button.pack(fill=Tkinter.BOTH, expand=1, side=Tkinter.RIGHT)
        self.save_as_button.pack(fill=Tkinter.BOTH, expand=1, side=Tkinter.RIGHT)
        self.text.pack(fill=Tkinter.BOTH, expand=1)
        self.quit.pack(side=Tkinter.BOTTOM)

    def openfile(self):
        """Ask the user for a file and return its lines.

        The chosen path is stored in ``self.filename``.

        Returns:
            The file's lines as returned by ``readlines()``, or an empty
            list when no file was chosen.
        """
        self.filename = tkFileDialog.askopenfilename(**self.file_opt)
        if not self.filename:
            # Dialog cancelled: there is nothing to open.
            return []
        with open(self.filename) as in_file:
            return in_file.readlines()

    def save_button(self):
        """Ask for a target file and write the displayed text to it.

        Returns:
            The file name returned by the dialog (empty when cancelled, in
            which case nothing is written).
        """
        filename = tkFileDialog.asksaveasfilename(**self.file_opt)
        if filename:
            # 'end-1c' leaves out the final newline the Text widget appends.
            data = self.text.get('1.0', 'end-1c')
            if not isinstance(data, bytes):
                data = data.encode('utf-8')
            with open(filename, 'wb') as out_file:
                out_file.write(data)
        return filename

    def _display(self, content):
        """Replace the text area's contents with *content* (string or lines)."""
        self.text.delete('1.0', Tkinter.END)
        # join() accepts a list of lines as well as a plain string; a single
        # insert at the end keeps the lines in their original order.
        self.text.insert(Tkinter.END, ''.join(content or ''))

    def _show_chosen_file(self):
        """Button handler: pick a file and show it; keep the text on cancel."""
        lines = self.openfile()
        if self.filename:
            self._display(lines)
if __name__ == '__main__':
    # Local directory passed to the reader window. A raw string keeps the
    # backslashes literal ("\a" would otherwise turn into a BEL character).
    filepath = r'C:\Users\DELL\Desktop\auto_android_v365\daomubiji'
    # Base URL of the chapter pages that are scraped directly.
    url1 = 'http://www.idaomu.com/qin-ling-shen-shu-'
    main = main_frm
    for i in range(1, 2):
        # Chapter numbers are zero-padded to two digits in the URL.
        url = url1 + str(i).zfill(2) + '.html'
        html = search_html.gethtml(url)
        content = search_html.get_content(html)
        title_menu = search_html.get_title(html)
        main(content, title_menu, filepath)
最近在学习 Python,所以想写一个工具来爬取小说《盗墓笔记》。本话题仅供个人学习,不知道能否通过审核;我想以此记录自己的学习过程。