Releasing the source for the 知轩藏书 (Zxcs) crawler that Smart asked for.
As before it's Python + pyspider: the pyspider framework comes with a web UI (http://localhost:5000 by default), which makes it easy to manage; just paste the code in and run it.
The write-up on my blog: http://www.sxsay.com/872.html
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-04-28 1:24:37
# Project: Zxcs

import os
import urllib2

from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    # Directory the downloaded novels are saved to
    P_dir = '/Home/Book'

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }

    crawl_config = {
        'headers': headers,
        'timeout': 300
    }
    def __init__(self):
        # Fill in the site's real address yourself
        self.base_url1 = 'http://***.me/sort/'
        self.base_url2 = 'page/'
        self.CaterIds = ['23', '25', '26', '27', '28', '29', '36', '37',
                         '38', '39', '40', '41', '42', '43', '44', '45', '55']
        self.page_num = 1
        self.total_num = 5
    @every(minutes=24 * 60)
    def on_start(self):
        # Category id to readable name (a habit of mine; I need the name
        # later when importing into the database)
        cater_names = {
            '26': '玄幻', '38': '玄幻', '39': '玄幻',
            '25': '武侠', '36': '武侠', '37': '武侠',
            '28': '历史', '42': '历史', '43': '历史',
            '23': '都市', '55': '都市',
            '27': '科幻', '40': '科幻', '41': '科幻',
            '29': '游戏', '44': '游戏', '45': '游戏',
        }
        while self.page_num <= self.total_num:
            for cater_id in self.CaterIds:
                print cater_id
                url = self.base_url1 + cater_id + '/' + self.base_url2 + str(self.page_num) + '/'
                self.crawl(url, callback=self.index_page, save=cater_names[cater_id])
            self.page_num += 1
    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Listing page: queue every novel detail page
        for each in response.doc('#plist dd > div > a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.domain_page)

    def domain_page(self, response):
        # Novel page: queue its download page
        for each in response.doc('div[class="filecont"] a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)
    @config(priority=2)
    def detail_page(self, response):
        # Download page: save every attachment link found on it
        file_name = response.doc('h2').text()
        Down = ''
        for each in response.doc('.panel-body a[href^="http"]').items():
            Down = each.attr.href
            if self.download(self.P_dir, file_name, Down):
                print('attachment url is ' + Down)
        return {
            "url": response.url,
            "title": response.doc('title').text(),
            "Down": Down,
            "file_name": file_name,
        }
    # Save one attachment to P_dir/<file_name>.rar
    def download(self, P_dir, file_name, Down):
        if not os.path.isdir(P_dir):
            os.makedirs(P_dir)
        path = P_dir + '/' + file_name + '.rar'
        data = urllib2.urlopen(Down).read()
        with open(path, 'wb') as f:
            f.write(data)
        return True
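One note: the category name handed to save= in on_start is never read further down the chain. pyspider forwards whatever you pass to save on to the callback as response.save, so if you want the name to survive all the way to detail_page (for the database import mentioned below, say), it has to be threaded through each hop. A minimal sketch of the tweak to index_page, not part of the original code; domain_page would repeat the same save=response.save:

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('#plist dd > div > a[href^="http"]').items():
            # response.save holds the value the parent request passed via save=
            self.crawl(each.attr.href, callback=self.domain_page,
                       save=response.save)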
When I have time I'll also post the database-import source and the POST-submission source, to make it easy to hook into publishing interfaces such as 火车头 (LocoySpider).
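In the meantime, a rough sketch of what the POST submission could look like, using urllib2 to match the code above; the endpoint URL and field names here are placeholders I made up, not any real interface:

    import urllib
    import urllib2

    def post_book(title, category, down_url):
        # Placeholder endpoint and field names; swap in the real publishing interface.
        # pyspider hands back unicode strings, so encode before urlencoding.
        data = urllib.urlencode({
            'title': title.encode('utf-8'),
            'category': category.encode('utf-8'),
            'down': down_url,
        })
        req = urllib2.Request('http://example.com/api/publish', data)
        return urllib2.urlopen(req).read()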
@Smart