本文共 3151 字,大约阅读时间需要 10 分钟。
小说网站:https://www.xs880.com/
第一章内容: 可以发现相对于正常情况下,该网站一个章节分了几页,这增加了一点小麻烦。。。 不过我们可以先查看源码,右键选择查看源码即可。 重要信息皆已被圈出来,我们只需要获取“下一页”,即<div .class=“read-page”>中<a .href="/html/17516/12036575_2.html"> 代码:import requestsfrom bs4 import BeautifulSoupimport sysclass download_content(object): def __init__(self): self.server = "https://www.xs880.com/" self.target = "https://www.xs880.com/html/17516/12036575.html" self.target2 = "https://www.xs880.com/html/17516.html" self.names = [] self.nums = [] self.urls = [] self.urls2 = [] def get_download_url(self): #每一章开头的链接 req = requests.get(url=self.target2) req.encoding = 'gbk' html = req.text div_bf = BeautifulSoup(html, 'lxml') div = div_bf.find_all('ul', class_='tlist') a_bf = BeautifulSoup(str(div[0]), 'lxml') a = a_bf.find_all('a') print(a) self.nums = len(a[:]) # 剔除不必要的章节,并统计章节数 print(self.nums) for each in a[:]: #self.names.append(each.string) self.urls2.append(self.server + each.get('href')) self.urls2.reverse() print(self.urls2) #print(self.names) def get_urls(self): #一章小说 4页的链接 self.urls.append(self.target) print("进入循环前:",self.urls) while self.urls[len(self.urls)-1] != "https://www.xs880.com//html/17516.html": #req = requests.get(url=tgt) now_len = len(self.urls) req = requests.get(url=self.urls[now_len - 1]) req.encoding = 'gbk' html = req.text divbf = BeautifulSoup(html, 'lxml') tit = divbf.find_all('div', class_='article-title mt10') tit_bf = BeautifulSoup(str(tit[0]), 'lxml') b = tit_bf.find_all('h1') b1 = b[0].text.replace('',' ') print(b1) self.names.append(b1) div = divbf.find_all('div', class_='read-page') a_bf = BeautifulSoup(str(div[0]), 'lxml') a = a_bf.find_all('a') each = a[2] self.urls.append(self.server + each.get('href')) print(self.urls[len(self.urls)-1]) print(self.urls) print(len(self.urls)) def get_contents(self,target): req = requests.get(url=target) req.encoding = 'gbk' html = req.text bf = BeautifulSoup(html, 'lxml') texts = bf.find_all('div', class_="size_1") texts = texts[0].text.replace('\xa0' * 8, '\n\n') return texts def 
writer(self, name, path, text): write_flag = True with open(path, 'a', encoding='utf-8') as f: f.write(name + '\n') f.writelines(text) f.write('\n\n')if __name__ == "__main__": dl = download_content() dl.get_download_url() dl.get_urls() #dl.get_download_url() print('《无人生还》开始下载:') for i in range(len(dl.urls)-1): dl.writer(dl.names[i], 'D:\\deng\\无人生还.txt', dl.get_contents(dl.urls[i])) sys.stdout.write(" 已下载:%.3f%%" % float(i / len(dl.urls)) + '\r') print(" 已下载:%.3f%%" % float(i / len(dl.urls)) + '\r') sys.stdout.flush() print('《无人生还》下载完成')
参考:
转载地址:http://isfef.baihongyu.com/