最近在微信里看了一部叫《阴阳代理人》的小说,看到一半发现断了,作者说把后面的部分放到了百度贴吧。去贴吧一看,帖子排版乱糟糟的,读起来比较费劲,所以为了把这部小说看完,我写了个爬虫,把它抓下来。
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib2
import urllib
import re
# Novel chapter-index class: one page of the forum's featured-thread listing.
class ZHANGJIE:
    """Fetch one listing page and extract the thread ids it links to.

    ``zjUrl`` is the listing URL prefix and ``Num`` is the value appended to
    it (the page offset) to form the final URL.
    """

    # Captures the digits of every thread link of the form <a href="/p/<digits>".
    _THREAD_LINK = re.compile(r'<a href="/p/(\d+)"')

    def __init__(self, zjUrl, Num):
        """Store the listing URL prefix and the page offset."""
        self.zjUrl = zjUrl
        self.Num = Num

    def getPage(self):
        """Download the listing page (featured threads only) and return its body.

        Returns:
            Whatever ``response.read()`` yields for ``zjUrl + str(Num)``.
        """
        url = self.zjUrl + str(self.Num)
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def getNum(self, webcon=None):
        """Return the thread ids linked from the listing page, in reverse page order.

        Args:
            webcon: Optional page source that was already fetched. When omitted
                the page is downloaded with ``getPage()``.

        Returns:
            A list of id strings; the last link found on the page comes first.
            Empty when the page contains no thread links.
        """
        # Legacy behaviour: the result is also published as the module-level
        # name ``Num_r``, which the driver code in this file reads.
        global Num_r
        if webcon is None:
            webcon = self.getPage()
        # Reversed relative to page order; presumably the listing shows the
        # newest chapter first -- TODO confirm against the live page.
        Num_r = self._THREAD_LINK.findall(webcon)[::-1]
        return Num_r
# Thread-content class: one forum thread holding one chapter of the novel.
class NEIRONG:
    """Fetch a single thread and extract its title and chapter text.

    ``nrUrl`` is the thread URL prefix and ``Num`` is the thread id appended
    to it.
    """

    def __init__(self, nrUrl, Num):
        """Store the thread URL prefix and the thread id."""
        self.nrUrl = nrUrl
        self.Num = Num

    def getPage(self):
        """Download the thread page (original poster's posts only) and return its body.

        Returns:
            Whatever ``response.read()`` yields for
            ``nrUrl + str(Num) + '?see_lz=1'``.
        """
        url = self.nrUrl + str(self.Num) + '?see_lz=1'
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

    def getTitle(self, webcon=None):
        """Return the chapter title taken from the text preceding ``</h2>``.

        Args:
            webcon: Optional page source that was already fetched. When omitted
                the page is downloaded with ``getPage()``.

        Returns:
            The captured title text, or an empty string when the page has no
            ``</h2>`` heading, so that one unusual page does not abort the
            whole crawl.
        """
        if webcon is None:
            webcon = self.getPage()
        match = re.search('>(.*)</h2>', webcon)
        if match is None:
            return ''
        return match.group(1)

    def getCon(self, webcon=None):
        """Return the chapter body with ``<br>`` tags turned into newlines.

        The body is the text between the fixed marker sentence and the closing
        ``</div><br>``.

        Args:
            webcon: Optional page source that was already fetched. When omitted
                the page is downloaded with ``getPage()``.

        Returns:
            The chapter text, or a fixed placeholder string when the marker is
            absent (the original author treats such threads as advertisements).
        """
        if webcon is None:
            webcon = self.getPage()
        match = re.search('支持兰大,一定要记得投票哦!推荐票!(.*)(</div><br>)', webcon)
        if match is None:
            return '广告内容,已经忽略'
        return match.group(1).replace('<br>', '\n')

    def writeDate(self):
        """Append two blank lines, the title and the chapter text to /tmp/yydlr.txt.

        The page is downloaded once and shared by both extractors. An
        ``IOError`` while writing is reported on stdout and otherwise ignored.
        """
        webcon = self.getPage()
        con_t = self.getTitle(webcon)
        con_n = self.getCon(webcon)
        try:
            with open('/tmp/yydlr.txt', 'a') as f:
                f.write('\n')
                f.write('\n')
                f.write(con_t)
                f.write(con_n)
        except IOError:
            # Single-argument print(...) produces the same output as the
            # Python 2 print statement.
            print('写入异常')
# Crawl driver: walk the featured-thread listing from offset 3050 down to 1700
# in steps of 50 and append every linked thread to the output file.
LIST_URL = 'http://tieba.baidu.com/f/good?kw=%E9%98%B4%E9%98%B3%E4%BB%A3%E7%90%86%E4%BA%BA&ie=utf-8&cid=0&pn='
THREAD_URL = 'http://tieba.baidu.com/p/'

for page_offset in range(3050, 1650, -50):
    # Use getNum()'s return value directly instead of the Num_r global it
    # also sets, and give the inner loop its own variable name so it no
    # longer shadows the page offset.
    for thread_id in ZHANGJIE(LIST_URL, page_offset).getNum():
        NEIRONG(THREAD_URL, thread_id).writeDate()

# Single-argument print(...) produces the same output as the Python 2 print
# statement.
print('小说已经准备好啦')
亿速云「云服务器」,即开即用、新一代英特尔至强铂金CPU、三副本存储NVMe SSD云盘,价格低至29元/月。点击查看>>
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。