Files
auto-solution/scripts/_dump_paragraphs.py
2026-05-07 15:40:57 +08:00

41 lines
1.1 KiB
Python

# -*- coding: utf-8 -*-
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET
# Prefix map passed to ElementTree find() calls so "w:..." paths resolve to
# the WordprocessingML main namespace.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# Fully qualified ("{namespace-uri}local-name") tags, in the form ElementTree
# stores in Element.tag: w:p is a paragraph, w:t is a run's text element.
W_P = "{" + NS["w"] + "}p"
W_T = "{" + NS["w"] + "}t"
def para_text(p):
    """Return the text of one paragraph element.

    Concatenates, in document order, the ``text`` of every ``w:t``
    descendant of *p*. Elements with no text contribute nothing.

    The ``tail`` of a ``w:t`` element (character data that follows its
    closing tag) is intentionally ignored: it lies outside the text element,
    so including it would inject layout whitespace into the result whenever
    the XML is indented.

    Args:
        p: An ElementTree element, normally a ``w:p`` paragraph.

    Returns:
        The joined text as a ``str``; empty when *p* has no ``w:t`` text.
    """
    return "".join(t.text for t in p.iter(W_T) if t.text)
def dump(docx_name, out_txt):
    """Write an indexed, one-line-per-paragraph text dump of a .docx file.

    Both paths are resolved against the directory one level above the
    directory containing this script. Only ``w:p`` elements that are direct
    children of ``w:body`` are indexed; other children (and anything nested
    inside them) are skipped. Each output line is
    ``<index>\\t<first 200 characters of the paragraph text>`` with newlines
    inside the text replaced by spaces and surrounding whitespace stripped.

    Args:
        docx_name: Name of the input .docx, relative to the root directory.
        out_txt: Name of the UTF-8 text file to write, relative to the same
            root directory.

    Raises:
        ValueError: If ``word/document.xml`` contains no ``w:body`` element.
    """
    root_dir = Path(__file__).resolve().parents[1]

    # Read the main document part and release the archive handle right away.
    with zipfile.ZipFile(root_dir / docx_name) as archive:
        document_xml = archive.read("word/document.xml")

    body = ET.fromstring(document_xml).find("w:body", NS)
    if body is None:
        raise ValueError(f"{docx_name}: word/document.xml has no w:body element")

    paragraphs = (child for child in body if child.tag == W_P)
    lines = []
    for idx, para in enumerate(paragraphs):
        text = para_text(para).replace("\n", " ").strip()
        # Cap each entry at 200 characters to keep the index readable.
        lines.append(f"{idx}\t{text[:200]}")

    (root_dir / out_txt).write_text("\n".join(lines), encoding="utf-8")
    print("wrote", len(lines), "paragraphs to", out_txt)
if __name__ == "__main__":
    # (input .docx, output index file) pairs: the software requirements
    # specification and the software design description.
    for source_docx, index_txt in (
        ("xxx控制子系统软件需求规格说明书.docx", "_srs_paragraphs_index.txt"),
        ("xxx控制子系统软件设计说明书.docx", "_sdd_paragraphs_index.txt"),
    ):
        dump(source_docx, index_txt)