用 Python 打造个人小说库

Posted 2025-03-27 Updated 2025-03- 27

By power 已删除用户

19~24 min read

Python实现全网小说下载：轻松获取你喜欢的网络小说！

大家好，今天给大家推荐一个非常实用的Python项目——全网小说下载器。通过这个项目，你可以轻松下载你喜欢的网络小说，并且支持搜索功能，输入小说名字或作者即可下载相应的小说内容。无论是追更还是收藏，这个工具都能满足你的需求。

项目亮点

系统分析网页性质：通过分析目标网站的网页结构，精准定位小说内容。
结构化的数据解析：使用parsel库解析网页数据，提取小说章节和内容。
函数式编程：代码结构清晰，功能模块化，易于理解和扩展。
下载进度条显示：使用tqdm库显示下载进度，提升用户体验。

环境介绍

Python 3.6：推荐使用Python 3.6及以上版本。
Pycharm 编辑器：专业版需要激活码，社区版免费使用。
依赖库：
- requests：用于发送HTTP请求。
- parsel：用于解析HTML数据。
- pandas：用于数据处理和展示。
- tqdm：用于显示下载进度条。

代码实现过程

1. 数据来源分析

首先，我们需要确定需求：爬取笔趣阁小说内容，并实现搜索功能。通过分析目标网站的网页结构，我们可以找到小说章节列表页和小说内容页的URL。

2. 代码实现

接下来，我们来看一下代码的具体实现。

2.1 发送请求

我们使用requests库发送HTTP请求，获取网页的HTML源代码。

import requests

def get_response(html_url):
    '''发送请求'''
    response = requests.get(url=html_url, headers=headers)
    response.encoding = response.apparent_encoding
    return response

2.2 解析数据

使用 parsel 库解析 HTML 数据，提取小说章节的 URL 和内容。

import parsel

def get_novel_url(html_url):
    """获取小说章节url"""
    response = get_response(html_url)
    selector = parsel.Selector(response.text)
    name = selector.css('#info h1::text').get()
    href = selector.css('#list dd a::attr(href)').getall()
    href = ['https://www.mayiwsk.com/' + i for i in href]
    for index in tqdm(href):
        response = get_response(index)
        selector = parsel.Selector(response.text)
        title = selector.css('.bookname h1::text').get()
        content_list = selector.css('#content::text').getall()
        content = ''.join(content_list)
        save(name, title, content)

2.3 保存数据

将下载的小说内容保存到本地文件中。 python

def save(name, title, content):
    """保存小说"""
    with open(name + '.txt', mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('n')
        f.write(content)
        f.write('n')

2.4 实现搜索功能

通过输入小说名字或作者，搜索并下载相应的小说。

while True:
    print('输入0即可退出程序')
    word = input('请输入你要下载的小说名字(作者): ')
    if word == '0':
        break
    search_url = 'https://www.mayiwsk.com/modules/article/search.php'
    data = {
        'searchkey': word,
        'searchtype': 'articlename'
    }
    response_1 = requests.post(url=search_url, data=data, headers=headers, verify=False)
    response_1.encoding = response_1.apparent_encoding
    selector_1 = parsel.Selector(response_1.text)
    trs = selector_1.css('#nr')
    lis = []
    if trs:
        for tr in trs:
            novel_name = tr.css('td:nth-child(1) a::text').get()
            novel_id = tr.css('td:nth-child(1) a::attr(href)').get().replace('/', '')
            author = tr.css('td:nth-child(3)::text').get()
            dit = {
                '书名': novel_name,
                '作者': author,
                '书ID': novel_id,
            }
            lis.append(dit)
        print(f'一共搜索到{len(lis)}数据内容')
        search_result = pd.DataFrame(lis)
        print(search_result)
        while True:
            try:
                num = input('请输入你要下载的小说序号(输入 r 重新搜索): ')
                if num.lower() == 'r':
                    break
                num = int(num)
                if 0 <= num < len(lis):
                    num_id = lis[num]['书ID']
                    link_url = f'https://www.mayiwsk.com/{num_id}/index.html'
                    if get_novel_url(link_url):
                        print(f"{lis[num]['书名']}已经下载完成了")
                    break
                else:
                    print(f"序号必须在 0-{len(lis)-1} 之间")
            except ValueError:
                print("请输入有效的序号或 'r' 重新搜索")
    else:
        print('抱歉，搜索没有结果^_^')

总结

最后贴上完整代码

import requests  
import parsel   
import pandas as pd  
from tqdm import tqdm  
import urllib3  


urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    'Cookie': 'Hm_lvt_388935f17c59007418b478114fe7cb71=1742448451; HMACCOUNT=243404375A72A8C8; Hm_lpvt_388935f17c59007418b478114fe7cb71=1742448519',
    'referer': 'https://www.mayiwsk.com/modules/article/search.php'
}

def get_response(html_url):
    '''发送HTTP请求并获取响应'''
    response = requests.get(url=html_url, headers=headers, verify=False)  
    response.encoding = response.apparent_encoding  
    return response

def save(name, title, content):
    """保存小说内容到文本文件
    Args:
        name: 小说名称
        title: 章节标题
        content: 章节内容
    """
    with open(name + '.txt', mode='a', encoding='utf-8') as f:
        f.write(title)
        f.write('n')
        f.write(content)
        f.write('n')

def get_novel_url(html_url):
    """获取并下载小说所有章节内容
    Args:
        html_url: 小说目录页面URL
    """
    response = get_response(html_url)
    selector = parsel.Selector(response.text)  


    name = selector.css('#info h1::text').get()
    href = selector.css('#list dd a::attr(href)').getall()
    href = ['https://www.mayiwsk.com/' + i for i in href]  


    for index in tqdm(href):
        response = get_response(index)
        selector = parsel.Selector(response.text)
        title = selector.css('.bookname h1::text').get()  
        content_list = selector.css('#content::text').getall()  
        content = ''.join(content_list)  
        save(name, title, content)  

if __name__ == '__main__':
    while True:
        print('输入0即可退出程序')
        word = input('请输入你要下载的小说名字(作者): ')
        if word == '0':
            break


        search_url = 'https://www.mayiwsk.com/modules/article/search.php'
        data = {
            'searchkey': word,
            'searchtype': 'articlename'
        }


        response_1 = requests.post(url=search_url, data=data, headers=headers, verify=False)
        response_1.encoding = response_1.apparent_encoding
        selector_1 = parsel.Selector(response_1.text)


        trs = selector_1.css('#nr')  
        lis = []
        if trs:  

            for tr in trs:
                novel_name = tr.css('td:nth-child(1) a::text').get()  
                novel_id = tr.css('td:nth-child(1) a::attr(href)').get().replace('/', '')  
                author = tr.css('td:nth-child(3)::text').get()  
                dit = {
                    '书名': novel_name,
                    '作者': author,
                    '书ID': novel_id,
                }
                lis.append(dit)


            print(f'一共搜索到{len(lis)}数据内容')
            search_result = pd.DataFrame(lis)
            print(search_result)


            while True:
                try:
                    num = input('请输入你要下载的小说序号(输入 r 重新搜索): ')
                    if num.lower() == 'r':  
                        break
                    num = int(num)
                    if 0 <= num < len(lis):  
                        num_id = lis[num]['书ID']  
                        link_url = f'https://www.mayiwsk.com/{num_id}/index.html'
                        get_novel_url(link_url)  
                        print(f"{lis[num]['书名']}已经下载完成了")
                        break
                    else:
                        print(f"序号必须在 0-{len(lis)-1} 之间")
                except ValueError:
                    print("请输入有效的序号或 'r' 重新搜索")
        else:
            print('抱歉，搜索没有结果^_^')

知识库

License: CC BY 4.0