# -*- coding: utf-8 -*-
"""Write a numbered index of the top-level paragraphs of Word documents.

For each ``.docx`` file the script reads ``word/document.xml`` from the
archive, collects the text of every paragraph that is a direct child of the
document body, and writes one ``<index>\\t<text>`` line per paragraph to a
UTF-8 text file.
"""
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET

# WordprocessingML namespace, plus the fully-qualified tag names for
# paragraph (w:p) and text (w:t) elements.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
W_P = "{%s}p" % NS["w"]
W_T = "{%s}t" % NS["w"]


def para_text(p: ET.Element) -> str:
    """Return the concatenated text of every ``w:t`` descendant of *p*.

    Only the ``text`` of each ``w:t`` element is used.  Its ``tail`` (whatever
    follows the closing ``</w:t>`` tag inside the run) is ignored: that is
    XML formatting whitespace, not paragraph content, and including it would
    inject stray blanks when the XML happens to be pretty-printed.

    Args:
        p: Element to search (normally a ``w:p`` paragraph).

    Returns:
        The joined text, or ``""`` when there are no non-empty ``w:t`` nodes.
    """
    return "".join(t.text for t in p.iter(W_T) if t.text)


def dump(docx_name: str, out_txt: str) -> None:
    """Write an index of the body-level paragraphs of a ``.docx`` file.

    Both paths are resolved against the parent of the directory that
    contains this script.  Each output line has the form
    ``"<index>\\t<text>"``: newlines inside the paragraph text are replaced by
    spaces, the text is stripped, and then truncated to 200 characters.
    Only ``w:p`` elements that are direct children of ``w:body`` are indexed,
    so paragraphs nested in other body-level elements are skipped.

    Args:
        docx_name: Path of the input ``.docx`` file.
        out_txt: Path of the text file to write (UTF-8, lines joined with
            ``"\\n"``, no trailing newline).

    Raises:
        ValueError: If ``word/document.xml`` has no ``w:body`` element.
    """
    root_dir = Path(__file__).resolve().parents[1]

    # Read the XML inside a ``with`` block so the archive handle is always
    # closed, even if reading fails.
    with zipfile.ZipFile(root_dir / docx_name) as archive:
        document_xml = archive.read("word/document.xml")

    body = ET.fromstring(document_xml).find("w:body", NS)
    if body is None:
        raise ValueError(
            "word/document.xml in %s has no w:body element" % docx_name
        )

    lines = []
    # ``findall`` with a plain tag path matches direct children only.
    for idx, para in enumerate(body.findall("w:p", NS)):
        text = para_text(para).replace("\n", " ").strip()
        lines.append(f"{idx}\t{text[:200]}")

    (root_dir / out_txt).write_text("\n".join(lines), encoding="utf-8")
    print("wrote", len(lines), "paragraphs to", out_txt)


if __name__ == "__main__":
    dump("xxx控制子系统软件需求规格说明书.docx", "_srs_paragraphs_index.txt")
    dump("xxx控制子系统软件设计说明书.docx", "_sdd_paragraphs_index.txt")