第一步:获取小说网页信息;
第二步:获取小说的章节标题和可访问的url;
第三步:获取各章节的小说内容;
第四步:把章节标题和内容写入文件。
语言: python 3.x
第三方库:requests(re 是 Python 标准库,无需另外安装)
浏览器:Chrome
IDE:Pycharm或VScode
第一种方法:面向过程
import requests
import re
# Fetch the novel's index page and collect one (href, title) pair per chapter.
url = 'http://www.biqukan.com/1_1094/'
req = requests.get(url).text
# The slice drops the first 12 matches; the original note describes the
# skipped entries as the "latest updates" links listed before the full index.
webs = re.findall(r'<dd><a href ="(.*?)">(.*?)</a></dd>', req)[12:]

# `with` closes the output file even if a request or a lookup below fails.
with open('一念永恒1.txt', 'w', encoding='utf-8') as f:
    for web in webs:
        novel_title = web[1]
        novel_urls = web[0]
        # Inspect the href itself: `web` is a (href, title) tuple, so a
        # membership test on the tuple never recognises an absolute URL.
        if not novel_urls.startswith(('http://', 'https://')):
            novel_urls = 'http://www.biqukan.com%s' % web[0]

        # Download the chapter page and pull out the body of the content div.
        # re.S lets the body span several lines of HTML.
        html = requests.get(novel_urls).text
        found = re.findall(
            r'<div id="content" class="showtxt">(.*?)</div>', html, re.S)
        if not found:
            raise IndexError('chapter content not found in %s' % novel_urls)
        novel_content = found[0]

        # '<br />' has to be removed before plain spaces: stripping spaces
        # first turns it into '<br/>' and the tag would survive.
        novel_content = novel_content.replace('<br />', '')
        # NOTE(review): the space literal below was probably '&nbsp;' before
        # the article was rendered as HTML - both are stripped; confirm.
        novel_content = novel_content.replace('&nbsp;', '')
        novel_content = novel_content.replace(' ', '')
        novel_content = novel_content.replace('【感谢大家一直以来的支持,这次起-点515粉丝节的作家荣耀堂和作品总选举,希望都能支持一把。另外粉丝节还有些红包礼包的,领一领,把订阅继续下去!', '')
        novel_content = novel_content.replace('请记住本书首发域名:www.biqukan.com。笔趣阁手机版阅读网址:m.biqukan.com', '')
        # The chapter's own URL is removed from the extracted text as well.
        novel_content = novel_content.replace(novel_urls, '')

        # Append the chapter: title line, body, then a blank separator line.
        f.write(novel_title + '\n')
        f.write(novel_content + '\n\n')
第二种方法:面向对象
import requests
import re
# 定义下载小说的类
class download_novel(object):
    """Download every chapter of a novel into a single UTF-8 text file.

    The index page at ``url`` is scanned for chapter links; each chapter page
    is fetched, stripped of markup and boilerplate, and written to the output
    file as a title line followed by the chapter text.
    """

    # Prefix that turns a site-relative chapter href into a full URL.
    BASE_URL = 'http://www.biqukan.com'

    # Fragments deleted verbatim from every chapter body, in this order.
    # '<br />' must come before the plain space: stripping spaces first turns
    # it into '<br/>' and the tag would survive.
    # NOTE(review): the space entry was probably '&nbsp;' before the article
    # was rendered as HTML - both are stripped here; confirm.
    _JUNK = (
        '<br />',
        '&nbsp;',
        ' ',
        '【感谢大家一直以来的支持,这次起-点515粉丝节的作家荣耀堂和作品总选举,希望都能支持一把。另外粉丝节还有些红包礼包的,领一领,把订阅继续下去!',
        '请记住本书首发域名:www.biqukan.com。笔趣阁手机版阅读网址:m.biqukan.com',
    )

    def __init__(self, url):
        """Store the URL of the novel's index page."""
        self.url = url

    # 1. Collect the chapter links.
    def get_urls(self):
        """Return a list of ``(href, title)`` tuples found on the index page.

        The slice drops the first 12 matches; the original note describes the
        skipped entries as the "latest updates" links listed before the full
        chapter index.
        """
        req = requests.get(self.url).text
        return re.findall(r'<dd><a href ="(.*?)">(.*?)</a></dd>', req)[12:]

    @classmethod
    def _absolute_url(cls, href):
        """Return ``href`` unchanged if already absolute, else prefix BASE_URL."""
        if href.startswith(('http://', 'https://')):
            return href
        return '%s%s' % (cls.BASE_URL, href)

    @classmethod
    def _clean_content(cls, html, novel_urls):
        """Extract the chapter text from ``html`` and strip known junk.

        Raises:
            IndexError: if the page contains no content div.
        """
        # re.S lets the chapter body span several lines of HTML.
        found = re.findall(
            r'<div id="content" class="showtxt">(.*?)</div>', html, re.S)
        if not found:
            raise IndexError('chapter content not found in %s' % novel_urls)
        text = found[0]
        for junk in cls._JUNK:
            text = text.replace(junk, '')
        # The chapter's own URL is removed from the extracted text as well.
        return text.replace(novel_urls, '')

    # 2. Fetch one chapter.
    def get_content(self, novel_urls):
        """Fetch the chapter page at ``novel_urls`` and return its clean text.

        Raises:
            IndexError: if the page contains no content div.
        """
        html = requests.get(novel_urls).text
        return self._clean_content(html, novel_urls)

    # 3. Write everything to disk.
    def write_novel(self, filename='一念永恒.txt'):
        """Download all chapters and write them to ``filename``.

        Each chapter is written as its title, a newline, its text and a blank
        line. The URL of every chapter is printed once it has been fetched.
        """
        chapters = self.get_urls()
        with open(filename, 'w', encoding='utf-8') as f:
            for href, novel_title in chapters:
                # Check the href itself; testing the (href, title) tuple for
                # 'http' never detects an absolute link.
                chapter_url = self._absolute_url(href)
                novel_content = self.get_content(chapter_url)
                print(chapter_url)
                f.write(novel_title + '\n')
                f.write(novel_content + '\n\n')
# 实例
if __name__ == '__main__':
    # Script entry point: build a downloader for the novel's index page and
    # write all of its chapters to disk.
    index_url = 'http://www.biqukan.com/1_1094/'
    downloader = download_novel(index_url)
    downloader.write_novel()
还有函数式的写法,这里就不列举了。面向对象的方法看起来比较复杂,但是它对以后的维护和更新有很大的帮助,慢慢掌握它才是正道。