分类导航

程序笔记发布时间：2022-07-21 发布网站：大佬教程 code.js-code.com

大佬教程收集整理的这篇文章主要介绍了用python实现word文档转markdown，大佬教程大佬觉得挺不错的，现在分享给大家，也给大家做个参考。

# -*- coding: UTF-8 -*-

'''
作者：zhangdongyu

简介：把word文档转为markdown文档

原理：
    1. 根据Word文字颜色判断是否为标题、行代码
    2. 根据Word文字是否加粗、倾斜进行加粗和倾斜判定
    3. 根据Word“项目符号/编号”判断是否为列表
    4. 根据Word 1x1的表格判断是否为块代码
    5. 当然也可以根据字体大小、是否倾斜、或者其它指标判断是否是标题、行代码、块代码，自己按需修改源码实现

注意：文档中的图片无法处理

参考资料：
    [python-docx官方文档](https://python-docx.readthedocs.io/en/latest/index.html)
    [第105天： Python 操作 Word](http://www.ityouknow.com/python/2019/12/31/python-word-105.html)
    [python-docx处理word文档](https://zhuanlan.zhihu.com/p/61340025)
    [Python顺序读取word文档中的文本与表格](https://blog.csdn.net/qq_39600166/article/details/101537368)

配置文件示例（json格式，Windows下需要两个反斜杠）：
    配置说明：
        head_1_color：一级标题颜色（7030A0：紫色）
        head_2_color：二级标题颜色（0070C0：蓝色）
        head_3_color：三级标题颜色（00B050：绿色）
        head_4_color：四级标题颜色（C55A11：橙色）
        head_5_color：五级标题颜色（FF66CC：粉色）
        line_code_color：行代码颜色（C00000：红色）
        head_1_sharp_num：一级标题几个#号，后续标题#个数依次递增
        mode：1：处理单个docx文档，2：处理目录下的所有docx文档
        src_path：docx文档路径（模式1），或目录路径（模式2）
                  模式1：e.g. D:\\下载\\test.docx
                  模式2：e.g. D:\\下载\\test
        save_path：markdown文件保存路径
                  模式1：e.g. D:\\下载\\test.md
                  模式2：e.g. D:\\下载\\test_md

    单docx转markdown：
    {"head_1_color":"7030A0", "head_2_color":"0070C0", "head_3_color":"00B050", 
    "head_4_color":"C55A11", "head_5_color":"FF66CC", "line_code_color":"C00000", 
    "head_1_sharp_num":2, "mode":1, "src_path":"D:\\下载\\test.docx", "save_path":"D:\\下载\\test.md"}

    一个目录下的所有docx转markdown：
    {"head_1_color":"7030A0", "head_2_color":"0070C0", "head_3_color":"00B050", 
    "head_4_color":"C55A11", "head_5_color":"FF66CC", "line_code_color":"C00000", 
    "head_1_sharp_num":2, "mode":2, "src_path":"D:\\下载\\test", "save_path":"D:\\下载\\test_md"}
'''

import os
import shutil
import glob
import json
import docx
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import RGBColor


def iter_block_items(parent):
    '''
    Yield each paragraph and table child within *parent*, in document order.

    Each yielded value is an instance of either Table or Paragraph. *parent*
    is either a Document object or a _Cell object (a cell can itself contain
    paragraphs and tables).

    Raises:
        ValueError: if *parent* is neither a Document nor a _Cell. Because
            this is a generator, the error surfaces on first iteration.
    '''
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        # CT_P / CT_Tbl are the XML element classes imported at the top of
        # the file; any other child element is skipped.
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)


def write_paragraph(block, f_md):
    '''
    Convert one Word paragraph to a single Markdown line and write it to *f_md*.

    Relies on module-level settings assigned in the ``__main__`` section:
    ``h1c`` .. ``h5c`` (heading colours as 6-digit hex strings), ``lcc``
    (inline-code colour) and ``h1n`` (number of ``#`` for a level-1 heading).

    Rules:
        * The colour of the FIRST run decides whether the paragraph is a
          heading (up to five levels). Heading text is emitted without any
          inline formatting.
        * Otherwise each run is emitted as inline code (colour match), bold,
          italic or plain text -- first matching rule wins.
        * Paragraphs whose style is named 'List Paragraph' get a '- ' prefix.
        * A paragraph without runs produces an empty line.

    Args:
        block: a python-docx Paragraph.
        f_md: writable text file object receiving the Markdown output.
    '''
    text = ''
    heading_level = None  # 0-based heading index; None means "not a heading"

    if block.runs:
        # Heading detection looks only at the first run's explicit RGB colour.
        first_rgb = block.runs[0].font.color.rgb
        for level, hex_color in enumerate((h1c, h2c, h3c, h4c, h5c)):
            # RGBColor.from_string parses the hex string without eval().
            if first_rgb == RGBColor.from_string(hex_color):
                heading_level = level
                break

        if heading_level is not None:
            text = ''.join(run.text for run in block.runs)
        else:
            code_rgb = RGBColor.from_string(lcc)
            parts = []
            for run in block.runs:
                if run.font.color.rgb == code_rgb:  # inline code
                    parts.append('`' + run.text + '`')
                elif run.bold or run.font.cs_bold:  # bold (regular or complex-script flag)
                    parts.append('**' + run.text + '** ')
                elif run.italic or run.font.cs_italic:  # italic (regular or complex-script flag)
                    parts.append('*' + run.text + '* ')
                else:
                    parts.append(run.text)
            text = ''.join(parts)

        # Bulleted / numbered paragraphs become Markdown list items; every
        # other style (including 'Normal') is left untouched.
        if block.style.name == 'List Paragraph':
            text = '- ' + text

    if heading_level is not None:
        f_md.write('#' * (h1n + heading_level) + ' ' + text + '\n')
    else:
        f_md.write(text + '\n')


def write_table(block, f_md):
    '''
    Write a Word table to *f_md* as a fenced Markdown code block.

    Only the first column is used: the text of cell (row, 0) of every row
    becomes one line inside the fence.

    Args:
        block: table-like object exposing ``rows`` and ``cell(row, col)``
            (a python-docx Table in this script).
        f_md: writable text file object receiving the Markdown output.
    '''
    f_md.write('```\n')
    for row_idx in range(len(block.rows)):
        f_md.write(block.cell(row_idx, 0).text + '\n')
    f_md.write('```\n')


def docx_2_markdown(docx_path, md_save_path):
    '''
    Convert the .docx file at *docx_path* into a Markdown file at *md_save_path*.

    Paragraphs and tables are processed in document order; paragraphs go
    through write_paragraph and tables through write_table.

    Args:
        docx_path: path of the Word document to read.
        md_save_path: path of the Markdown file to create (overwritten if it
            exists, written as UTF-8).
    '''
    # Load the document first so a bad input path does not leave an empty
    # output file behind.
    doc = docx.Document(docx_path)
    # The context manager closes the file even if a block fails to convert.
    with open(file=md_save_path, mode='wt', encoding='utf-8') as f_md:
        for block in iter_block_items(doc):
            if isinstance(block, Paragraph):
                write_paragraph(block, f_md)
            elif isinstance(block, Table):
                write_table(block, f_md)


if __name__ == '__main__':
    # Usage text shown to the user. It is a RAW string so the doubled
    # backslashes in the example paths are printed verbatim -- the JSON
    # examples must stay valid JSON when copied back into the prompt.
    msg = r'''
作者：zhangdongyu

简介：把word文档转为markdown文档

原理：
    1. 根据Word文字颜色判断是否为标题、行代码
    2. 根据Word文字是否加粗、倾斜进行加粗和倾斜判定
    3. 根据Word“项目符号/编号”判断是否为列表
    4. 根据Word 1x1的表格判断是否为块代码
    5. 当然也可以根据字体大小、是否倾斜、或者其它指标判断是否是标题、行代码、块代码，自己按需修改源码实现

注意：文档中的图片无法处理

参考资料：
    [python-docx官方文档](https://python-docx.readthedocs.io/en/latest/index.html)
    [第105天：Python操作Word](http://www.ityouknow.com/python/2019/12/31/python-word-105.html)
    [python-docx处理word文档](https://zhuanlan.zhihu.com/p/61340025)
    [Python顺序读取word文档中的文本与表格](https://blog.csdn.net/qq_39600166/article/details/101537368)

配置文件示例（json格式，Windows下需要两个反斜杠）：
    配置说明：
        head_1_color：一级标题颜色（7030A0：紫色）
        head_2_color：二级标题颜色（0070C0：蓝色）
        head_3_color：三级标题颜色（00B050：绿色）
        head_4_color：四级标题颜色（C55A11：橙色）
        head_5_color：五级标题颜色（FF66CC：粉色）
        line_code_color：行代码颜色（C00000：红色）
        head_1_sharp_num：一级标题几个#号，后续标题#个数依次递增
        mode：1：处理单个docx文档，2：处理目录下的所有docx文档
        src_path：docx文档路径（模式1），或目录路径（模式2）
                  模式1：e.g. D:\\下载\\test.docx
                  模式2：e.g. D:\\下载\\test
        save_path：markdown文件保存路径
                  模式1：e.g. D:\\下载\\test.md
                  模式2：e.g. D:\\下载\\test_md

    单docx转markdown：
    {"head_1_color":"7030A0", "head_2_color":"0070C0", "head_3_color":"00B050", 
    "head_4_color":"C55A11", "head_5_color":"FF66CC", "line_code_color":"C00000", 
    "head_1_sharp_num":2, "mode":1, "src_path":"D:\\下载\\test.docx", "save_path":"D:\\下载\\test.md"}
    
    一个目录下的所有docx转markdown：
    {"head_1_color":"7030A0", "head_2_color":"0070C0", "head_3_color":"00B050", 
    "head_4_color":"C55A11", "head_5_color":"FF66CC", "line_code_color":"C00000", 
    "head_1_sharp_num":2, "mode":2, "src_path":"D:\\下载\\test", "save_path":"D:\\下载\\test_md"}
    '''
    print(msg)

    # Read the whole configuration as one line of JSON from the console.
    config = input("input config json content (copy and modify from above config examples):\n")
    config = json.loads(config.strip())

    # These module-level names are read by write_paragraph().
    h1c = config["head_1_color"]
    h2c = config["head_2_color"]
    h3c = config["head_3_color"]
    h4c = config["head_4_color"]
    h5c = config["head_5_color"]
    lcc = config["line_code_color"]
    h1n = config["head_1_sharp_num"]
    mode = config["mode"]
    src_path = config["src_path"]
    save_path = config["save_path"]

    if mode == 1:
        # Single document: src_path is a .docx file, save_path a .md file.
        docx_2_markdown(src_path, save_path)
        print('\nDone!')
    elif mode == 2:
        if os.path.abspath(src_path) == os.path.abspath(save_path):
            # The output directory is wiped below; refusing identical paths
            # prevents deleting the source documents.
            print('\nError: save_path must be different from src_path in mode 2.')
        else:
            # Recreate the output directory from scratch.
            if os.path.exists(save_path):
                shutil.rmtree(save_path)
            os.makedirs(save_path)

            # Collect every .docx directly inside src_path (non-recursive).
            docx_files = glob.glob(os.path.join(src_path, '*.docx'))

            # Convert each document to <same base name>.md inside save_path.
            for docx_file in docx_files:
                md_name = os.path.splitext(os.path.basename(docx_file))[0] + '.md'
                md_file = os.path.join(save_path, md_name)
                docx_2_markdown(docx_file, md_file)
            print('\nDone!')
    else:
        print('\nError: unsupported mode %r (expected 1 or 2).' % (mode,))

    # Keeps the console window open until the user presses Enter.
    input('Press any key to exit!')

大佬总结

以上是大佬教程为你收集整理的用python实现word文档转markdown全部内容，希望文章能够帮你解决用python实现word文档转markdown所遇到的程序开发问题。

如果觉得大佬教程网站内容还不错，欢迎将大佬教程推荐给程序员好友。

本图文内容来源于网友网络收集整理提供，作为学习参考使用，版权属于原作者。
如您有任何意见或建议可联系处理。小编QQ：384754419，请注明来意。

标签：list load php zhihu 文档 源码 程序员

上一篇: （四）.netcore+vue 项目搭建 | 下一篇: mysql隔离级别的实现

猜你在找的程序笔记相关文章

You can't specify target table 'xxx' for update in FROM clause的解决 2022-07-21
【UNIAPP】上传视频，进度条的前台与后端 2022-07-21
十款代码表白特效，一个比一个浪漫！ 2022-07-04
作业3 2022-07-06
linux系统下部署项目访问报404错误的解决方法 2022-05-30
C++带有指针成员的类处理方式详解 2022-06-07
Linux——（1）基本命令 2022-07-21
JavaScript之正则表达式学习笔记 2019-11-07
Redux源码学习笔记 2019-11-07
Webpack学习笔记 2019-11-07