pyspider is a powerful web crawler system, written by a Chinese developer, that ships with a powerful WebUI. It is written in Python, uses a distributed architecture, and supports multiple database backends; the WebUI includes a script editor, task monitor, project manager, and result viewer. Online demo: http://demo.pyspider.org/
https://github.com/binux/pyspider
# pycurl may be linked against the wrong SSL library; rebuild it against OpenSSL
pip uninstall pycurl
export PYCURL_SSL_LIBRARY=openssl
pip install pycurl
# cycle jsmin to work around a jsmin dependency issue, then install pyspider itself
pip install jsmin
pip uninstall jsmin
pip install pyspider
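A quick sanity check is to print pycurl's version string, which should now mention OpenSSL (exact version numbers will differ):

python -c "import pycurl; print(pycurl.version)"
# e.g. PycURL/7.43.0 libcurl/7.64.1 OpenSSL/1.1.1 ...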
Error log when starting pyspider:
ValueError: Invalid configuration:
- Deprecated option 'domaincontroller': use 'http_authenticator.domain_controller' instead.
Fix: pin wsgidav to an older release:
pipenv install wsgidav==2.4.1
https://segmentfault.com/q/1010000015429020?utm_source=tag-newest
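If you are not using pipenv, the same pin should work with plain pip:

pip install wsgidav==2.4.1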
Example: crawling link URLs from Taobao
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-01-28 18:08:45
# Project: testdemo1
"""
Crawl link URLs from Taobao
"""
from pyspider.libs.base_handler import *
from six import itervalues
import MySQLdb
import redis


class SQL(object):
    # Initialize the database connection
    def __init__(self):
        # Connection settings
        hosts = '192.168.1.103'
        username = 'root'
        password = '123321'
        database = 'pyspider'
        charsets = 'utf8'
        self.connection = False
        try:
            self.conn = MySQLdb.connect(host=hosts, port=8888, user=username, passwd=password, db=database,
                                        charset=charsets)
            self.cursor = self.conn.cursor()
            self.cursor.execute("set names " + charsets)
            self.connection = True
        except Exception as e:
            print("Cannot connect to MySQL!\n", e)

    def escape(self, string):
        # Quote identifiers with backticks
        return '`%s`' % string

    # Insert a row into the given table
    def insert(self, tablename=None, **values):
        if self.connection:
            tablename = self.escape(tablename)
            if values:
                _keys = ",".join(self.escape(k) for k in values)
                _values = ",".join(['%s', ] * len(values))
                sql_query = "insert into %s (%s) values (%s)" % (tablename, _keys, _values)
            else:
                sql_query = "replace into %s default values" % tablename
            try:
                if values:
                    self.cursor.execute(sql_query, list(itervalues(values)))
                else:
                    self.cursor.execute(sql_query)
                self.conn.commit()
                return True
            except Exception as e:
                print("An error occurred: ", e)
                return False


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl('https://www.taobao.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow every absolute link on the page
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        print("######### response url #########" + str(response.url))
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }

    def on_result(self, result):
        print("##################")
        if not result or not result['url']:
            return
        print(result)
        # Push the URL onto a Redis list and persist the row to MySQL
        r = redis.Redis(host='127.0.0.1', port=6379, db=0)
        r.lpush("url", result['url'])
        SQL().insert('t_pyspider_project', **result)
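The insert above assumes a t_pyspider_project table already exists in the pyspider database. A minimal setup sketch (column names come from the dict returned by detail_page; the column types and sizes are assumptions, adjust as needed):

import MySQLdb

conn = MySQLdb.connect(host='192.168.1.103', port=8888, user='root',
                       passwd='123321', db='pyspider', charset='utf8')
cur = conn.cursor()
# Two columns matching the result dict; types are assumptions
cur.execute("""
    CREATE TABLE IF NOT EXISTS t_pyspider_project (
        url   VARCHAR(1024) NOT NULL,
        title VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()

The URLs pushed to Redis in on_result can later be consumed from the other end of the list, e.g. with redis.Redis(host='127.0.0.1', port=6379, db=0).brpop('url').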
Run pyspider with a global configuration file:
pyspider --config config.json
Global configuration (config.json):
{
  "taskdb": "mysql+taskdb://username:password@host:port/taskdb",
  "projectdb": "mysql+projectdb://username:password@host:port/projectdb",
  "resultdb": "mysql+resultdb://username:password@host:port/resultdb",
  "message_queue": "amqp://username:password@host:port/%2F",
  "webui": {
    "username": "some_name",
    "password": "some_passwd",
    "need-auth": true
  }
}
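Filled in for the MySQL instance used in the script above (a sketch: the taskdb/projectdb/resultdb databases must already exist on that server, and the message_queue line can be omitted if RabbitMQ is not in use):

{
  "taskdb": "mysql+taskdb://root:123321@192.168.1.103:8888/taskdb",
  "projectdb": "mysql+projectdb://root:123321@192.168.1.103:8888/projectdb",
  "resultdb": "mysql+resultdb://root:123321@192.168.1.103:8888/resultdb",
  "webui": {
    "username": "some_name",
    "password": "some_passwd",
    "need-auth": true
  }
}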
pyspider all    # start all components (scheduler, fetcher, processor, result worker, webui)
pyspider one    # run the whole system in a single process, handy for debugging
I have uploaded the script to a GitHub repository:
https://github.com/xinxi1990/pyspiderScript.git
References:
https://zhuanlan.zhihu.com/p/39199546
https://www.jianshu.com/p/df34d9b2f248
https://www.cntofu.com/book/156/api/api5.md