Python + pyspider某小说站的爬虫,入数据库,火车头发布,资源下载到本地,另可写爬虫!
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-05-05 21:43:11
# Project: XiaoShuo

import codecs
import datetime
import json
import os
import random
import re
import sys
from contextlib import closing

import HTMLParser
import urllib2

import pymysql
import requests
from pyspider.libs.base_handler import *
- * p q* J- u) D! U- D# g0 o
class Handler(BaseHandler):
    """pyspider handler that crawls a novel site by category.

    Flow of callbacks: ``on_start`` queues category list pages,
    ``list_Caterg`` queues each book page, ``list_Caterg_detail`` reads the
    book metadata and cover, ``index_page`` queues the first chapter and
    ``detail_page`` stores a chapter and follows the "next chapter" link.

    Each chapter is written to MySQL (``add_*`` methods) and posted to a
    Locoy publishing endpoint (``add_locoy``).  The script targets Python 2
    (it relies on ``reload(sys)`` / ``sys.setdefaultencoding``).
    """

    # The original script exposed these two names as module-level globals;
    # that is preserved.  P_dir is the local directory covers are saved under.
    global P_dir
    P_dir = '/Tools/Debug/'
    global Datos
    Datos = {}

    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36'
    }
    crawl_config = {
        'headers': headers,
        'timeout': 300
    }

    # Connection settings shared by every insert (placeholders to fill in).
    DB_CONFIG = {
        'host': "localhost",
        'user': "数据库用户名",
        'password': "密码",
        'db': "数据库名",
        'charset': "utf8",
    }

    # Site category slug -> category name used when publishing.
    # NOTE(review): 'kongbu' is published under '游戏' -- confirm intended.
    CATEGORY_NAMES = {
        'xuanhuan': '玄幻',
        'wuxia': '武侠',
        'lishi': '历史',
        'yanqing': '都市',
        'nvsheng': '都市',
        'kehuan': '科幻',
        'kongbu': '游戏',
    }

    # Keys of the metadata dict handed from callback to callback via `save`.
    _META_KEYS = (
        "Cater_Name",
        "Book_author",
        "Book_Introduction",
        "Book_Synopsis",
        "Book_Palabras",
        "img",
    )

    # Literal (old, new) substitutions applied, in order, to chapter text.
    _CONTENT_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (u'小说网', u'文学网'),
        (u'fgdfgf', u'aghgf.com'),
        (u'fgfgf', u'aghgf.com'),
        (r'^\\n\\n', r'<BR>'),
    )
    # Literal (old, new) substitutions applied, in order, to the book intro.
    _INTRO_REPLACEMENTS = (
        (u'哈书', u'静思'),
        (u'huhjsd.CC', u'aghgf.com'),
        (r'^\\n\\n', u'aghgfh.com'),
    )
    # Characters removed from chapter text and intro after the substitutions.
    # NOTE(review): on Python 2 this is a byte-string pattern holding UTF-8
    # bytes -- confirm it matches the intended full-width punctuation.
    _STRIP_PATTERN = r'[\f\t\v+\.\{\(\)\}\!\/_,$%^*(+"\')]+|[+——()?【】“”!,。?、~@#¥%……&*()]+'

    def _execute_insert(self, sql, params, verbose=False):
        """Run one parameterized INSERT on a fresh connection and commit it.

        Values are passed separately from the statement so the driver
        escapes them; scraped text containing quotes can no longer break or
        alter the SQL.  The connection and cursor are always closed.
        Execution errors are printed and rolled back, not raised.
        """
        db = pymysql.connect(**self.DB_CONFIG)
        with closing(db):
            try:
                with closing(db.cursor()) as cursor:
                    if verbose:
                        print(sql)
                    cursor.execute(sql, params)
                    if verbose:
                        print(cursor.lastrowid)
                db.commit()
            except Exception as err:
                print("Error %s for execute sql: %s" % (err, sql))
                db.rollback()

    def add_BookFile(self, Bookname, BookIDs, img, Locaimg, Book_Dates):
        """Insert one cover record (remote URL and local path) into BookFile."""
        sql = ('insert into BookFile(Bookname, BookID, img, Locaimg, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, BookIDs, img, Locaimg, Book_Dates))

    def add_comment(self, Bookname, Booktitle, BookID, Titleid, Book_Date):
        """Insert one chapter-title record into BookTitle."""
        sql = ('insert into BookTitle(Bookname, Booktitle, BookID, Titleid, Book_Date) '
               'values (%s,%s,%s,%s,%s)')
        self._execute_insert(sql, (Bookname, Booktitle, BookID, Titleid, Book_Date))

    def add_question(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                     Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                     BookConte, Titleid, abover, Book_Date):
        """Insert one full chapter record into BookConte.

        Prints the statement and the new row id, as the original did.
        """
        sql = (' INSERT INTO BookConte (Bookname,Cater_Name,Book_author,'
               'Book_Introduction,Book_Synopsis,Book_Palabras,Bookurl,Booktitle,'
               'BookID,BookConte,Titleid,abover,Book_Date) '
               'VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                  BookConte, Titleid, abover, Book_Date)
        self._execute_insert(sql, params, verbose=True)

    def add_locoy(self, Bookname, Cater_Name, Book_author, Book_Introduction,
                  Book_Palabras, Book_img, Booktitle, BookConte, abover):
        """POST one chapter to the Locoy publishing endpoint as GBK form data.

        Returns the ``requests`` response object.
        """
        # Python 2 only: switch the interpreter default encoding to GBK.
        reload(sys)
        sys.setdefaultencoding("gbk")
        locoy_url = 'http://www.******.net/locoy/?my=book'  # Locoy publishing endpoint
        locoy_data = {
            'my_u': '用户名',  # back-office user name
            'my_p': '密码',  # back-office password
            'subject_669977_net': Bookname.encode('gbk', 'ignore'),
            'caid': Cater_Name.encode('gbk', 'ignore'),
            'title_669977_net': Booktitle.encode('gbk', 'ignore'),
            'article': BookConte.encode('gbk', 'ignore'),
            'author': Book_author.encode('gbk', 'ignore'),
            'ready_1': Book_Palabras.encode('gbk', 'ignore'),
            'thumb': Book_img,
            'content': Book_Introduction.encode('gbk', 'ignore'),
            'abover': abover.encode('gbk', 'ignore')
        }
        # A timeout keeps a stalled endpoint from blocking the worker forever.
        res = requests.post(locoy_url, data=locoy_data,
                            timeout=self.crawl_config['timeout'])
        print(res.text)
        print(res.content)
        return res

    def __init__(self):
        self.base_url1 = 'https://www.****.cc/'
        self.base_url2 = '/'
        self.CaterId = []
        self.CaterIds = ['xuanhuan', 'wuxia', 'yanqing', 'lishi', 'kehuan', 'kongbu', 'nvsheng']
        self.page_num = 1    # first list page to crawl
        self.total_num = 200  # last list page to crawl

    @every(minutes=8 * 60)
    def on_start(self):
        """Queue every category list page from page_num through total_num.

        Loop state is kept in locals, so each scheduled run covers the whole
        page range again instead of finding page_num already exhausted.
        """
        for page in range(self.page_num, self.total_num + 1):
            for cater_id in self.CaterIds:
                cater_name = self.CATEGORY_NAMES.get(cater_id, [])
                print(cater_id)
                url = self.base_url1 + str(cater_id) + self.base_url2 + str(page) + "/"
                self.crawl(url, callback=self.list_Caterg, save=cater_name)

    def list_Caterg(self, response):
        """Queue every book linked from a category list page."""
        Cater_Name = response.save
        for each in response.doc('.pic-list a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.list_Caterg_detail, save=Cater_Name)

    @staticmethod
    def _parse_word_count(text):
        """Return the second space-separated field of *text*, cut at '|'.

        Returns '' when the text has no second field, instead of raising.
        """
        fields = text.split(' ')
        if len(fields) < 2:
            return ''
        return fields[1].split('|')[0]

    @classmethod
    def _book_meta(cls, save):
        """Copy the per-book metadata keys out of a callback's `save` dict."""
        return dict((key, save[key]) for key in cls._META_KEYS)

    @staticmethod
    def _apply_replacements(text, replacements):
        """Apply literal (old, new) substitutions to *text* in order."""
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    def list_Caterg_detail(self, response):
        """Read book metadata, save the cover, then queue the chapter index."""
        Cater_Name = response.save
        Bookname = response.doc('h1').text()
        print(Bookname)
        Book_author = response.doc('.authorname > a').text()
        Book_Introduction = response.doc('.book-intro > div').text()
        Book_Synopsis = response.doc('b').eq(1).text()
        Book_Palabras = self._parse_word_count(response.doc('.booktitle p').text())
        BookIDs = response.url.split("xiaoshuo/")[-1].split("/")[0]  # book id
        Book_Dates = str(datetime.datetime.now())

        # Stays '' when the page has no cover, so the crawl can still continue.
        img = ''
        for imgs in response.doc('.bigpic > img[src^="http"]').items():
            img = imgs.attr.src
            print(img)
            # Download the cover image.
            extension = self.getExtension(img)
            name = self.getname(img)
            file_name = name + "." + extension
            imgDir = P_dir + name
            Locaimg = imgDir + "/" + file_name
            print(Locaimg)
            # Optional: save the cover locally.
            if self.download(P_dir, imgDir, file_name, img):
                print('attachment url is ' + img)
            # Optional: database publishing hook for other systems.
            self.add_BookFile(Bookname, BookIDs, img, Locaimg, Book_Dates)

        Datos = {
            "Cater_Name": Cater_Name,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "img": img,
        }
        for each in response.doc('div[class="bookbtn-txt"] a[class="catalogbtn"]').items():
            self.crawl(each.attr.href, callback=self.index_page, save=Datos)

    @config(age=8 * 60 * 60)
    def index_page(self, response):
        """Queue the first chapter listed on a book's chapter index."""
        Datos = self._book_meta(response.save)
        for each in response.doc('.chapter-list li:first-child a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)

    @config(priority=2)
    @catch_status_code_error
    def detail_page(self, response):
        """Store and publish one chapter, then queue the next chapter link.

        Returns a dict with the chapter fields as the task result.
        """
        Datos = self._book_meta(response.save)

        Bookname = response.doc('.readlocation a').eq(2).text()  # book name
        print(Bookname)
        Cater_Name = Datos['Cater_Name']  # category
        Book_author = Datos['Book_author']  # author
        Book_Synopsis = Datos['Book_Synopsis']  # latest update
        Book_Palabras = Datos['Book_Palabras']  # word count
        Bookurl = response.url  # chapter URL
        Booktitle = response.doc('.article-title').text()  # chapter title
        BookID = response.doc('.readset-r span').text()  # book id
        raw_content = response.doc('.article-con').text()  # chapter text
        # Status field (serial or finished), built from title + meta fields.
        abover = (response.doc('.article-title').text() + Datos['Book_Synopsis']
                  + Datos['Book_Palabras'] + Datos['Book_Introduction'])
        Book_Date = str(datetime.datetime.now())  # crawl time

        BookConte = self._apply_replacements(raw_content, self._CONTENT_REPLACEMENTS)
        BookConte = re.sub(self._STRIP_PATTERN, u'', BookConte)
        BookConte = BookConte.replace("\n\n", "<br>")
        print(BookConte)

        Book_Introduction = self._apply_replacements(
            Datos['Book_Introduction'], self._INTRO_REPLACEMENTS)
        Book_Introduction = re.sub(self._STRIP_PATTERN, u'', Book_Introduction)

        Titleid = response.url.split(BookID + "/")[-1].split("/")[0]
        # Plain string: a stray trailing comma used to make this a tuple.
        Book_img = Datos['img']  # cover URL

        # Optional: MySQL publishing hooks for other systems.
        self.add_question(Bookname, Cater_Name, Book_author, Book_Introduction,
                          Book_Synopsis, Book_Palabras, Bookurl, Booktitle, BookID,
                          BookConte, Titleid, abover, Book_Date)
        self.add_comment(Bookname, Booktitle, BookID, Titleid, Book_Date)
        # Optional: Locoy publishing via HTTP POST.
        self.add_locoy(Bookname, Cater_Name, Book_author, Book_Introduction,
                       Book_Palabras, Book_img, Booktitle, BookConte, abover)

        for each in response.doc('.articlebtn > a:nth-child(4)[href*="/xiaoshuo"]').items():
            self.crawl(each.attr.href, callback=self.detail_page, save=Datos)
        return {
            "Cater_Name": Cater_Name,
            "Bookname": Bookname,
            "Book_author": Book_author,
            "Book_Introduction": Book_Introduction,
            "Book_Synopsis": Book_Synopsis,
            "Book_Palabras": Book_Palabras,
            "Book_img": Book_img,
            "Bookurl": response.url,
            "Booktitle": Booktitle,
            "BookID": BookID,
            "BookConte": BookConte,
            "Titleid": Titleid,
            "abover": abover,
        }

    def download(self, P_dir, imgDir, file_name, Book_img):
        """Fetch *Book_img* and write it to ``imgDir/file_name``.

        Returns True when the file was written and False when the server
        answered with an error status (nothing is written in that case).
        Network errors from ``requests`` propagate to the caller.
        *P_dir* is accepted for compatibility and is not used.
        """
        imag = requests.get(Book_img, timeout=self.crawl_config['timeout'])
        if not imag.ok:
            return False
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(imgDir + "/" + file_name, 'wb') as f:
            f.write(imag.content)
        return True

    # Callback form of image saving: reads the target from response.save.
    def save_imgs(self, response):
        """Write the response body to ``save["imgDir"]/save["file_name"]``."""
        content = response.content
        file_name = response.save["file_name"]
        imgDir = response.save["imgDir"]
        # Join with a separator, matching the path layout download() uses.
        file_path = os.path.join(imgDir, file_name)
        self.save_img(content, imgDir, file_path)

    # Save image bytes to disk.
    def save_img(self, content, imgDir, path):
        """Create *imgDir* if needed and write *content* to *path*."""
        if not os.path.exists(imgDir):
            os.makedirs(imgDir)
        with open(path, "wb") as f:
            f.write(content)

    @staticmethod
    def _url_basename(url):
        """Return the last path segment of *url* without query or fragment."""
        path = url.split('?')[0].split('#')[0]
        return path.rstrip('/').split('/')[-1]

    # Get the file extension from a URL.
    def getExtension(self, url):
        """Return the extension of the URL's file name, or '' if it has none."""
        basename = self._url_basename(url)
        if '.' not in basename:
            return ''
        return basename.rsplit('.', 1)[1]

    # Get the image name from a URL.
    def getname(self, url):
        """Return the URL's file name up to its first dot."""
        return self._url_basename(url).split(".")[0]
复制代码