# auto-solution/scripts/_dump_paragraphs.py

# -*- coding: utf-8 -*-
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET

# Prefix -> namespace URI map for WordprocessingML, passed to ElementTree
# lookups that use the "w:" prefix.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
# Fully qualified "{namespace}local" tag names, the form ElementTree stores in
# Element.tag, for the paragraph (w:p) and text (w:t) elements.
W_P = "{" + NS["w"] + "}p"
W_T = "{" + NS["w"] + "}t"


def para_text(p):
    """Return the concatenated text of every ``w:t`` element under *p*.

    Args:
        p: An ElementTree element (a ``w:p`` paragraph in practice); all of
            its ``w:t`` descendants are visited in document order.

    Returns:
        The joined ``text`` of those elements, or ``""`` when there are none.
    """
    # Only Element.text is content. Element.tail is whatever follows the
    # closing </w:t> tag, which in WordprocessingML is at most indentation
    # whitespace from a pretty-printed file, so it is deliberately ignored.
    return "".join(t.text for t in p.iter(W_T) if t.text)


def dump(docx_name, out_txt):
    """Write an index of a .docx file's top-level paragraphs to a text file.

    Both names are resolved against the parent of the directory that holds
    this script. Only ``w:p`` elements that are direct children of ``w:body``
    are indexed; other body children are skipped and do not consume an index.

    Each output line is ``<index>\\t<text>``, where the text has newlines
    replaced by spaces, is stripped, and is cut to its first 200 characters.
    Lines are joined with ``"\\n"`` and written as UTF-8.

    Args:
        docx_name: File name of the .docx document to read.
        out_txt: File name of the text index to write.

    Raises:
        ValueError: If ``word/document.xml`` has no ``w:body`` element.
    """
    root_dir = Path(__file__).resolve().parents[1]
    # The context manager closes the archive even if reading or parsing fails.
    with zipfile.ZipFile(root_dir / docx_name) as z:
        root = ET.fromstring(z.read("word/document.xml"))
    body = root.find("w:body", NS)
    if body is None:
        raise ValueError(f"{docx_name}: word/document.xml has no w:body element")
    # Filter first so the index counts paragraphs only.
    paragraphs = (c for c in body if c.tag == W_P)
    lines = []
    for idx, para in enumerate(paragraphs):
        t = para_text(para).replace("\n", " ").strip()
        lines.append(f"{idx}\t{t[:200]}")
    (root_dir / out_txt).write_text("\n".join(lines), encoding="utf-8")
    print("wrote", len(lines), "paragraphs to", out_txt)


if __name__ == "__main__":
    # (input document, output index file) pairs, processed in order: the
    # software requirements specification, then the software design document.
    for source_docx, index_txt in (
        ("xxx控制子系统软件需求规格说明书.docx", "_srs_paragraphs_index.txt"),
        ("xxx控制子系统软件设计说明书.docx", "_sdd_paragraphs_index.txt"),
    ):
        dump(source_docx, index_txt)