41 lines
1.1 KiB
Python
41 lines
1.1 KiB
Python
|
|
# -*- coding: utf-8 -*-
|
||
|
|
import zipfile
|
||
|
|
from pathlib import Path
|
||
|
|
from xml.etree import ElementTree as ET
|
||
|
|
|
||
|
|
# Prefix-to-URI map handed to ElementTree lookups that use the "w:" prefix.
NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# Namespace-qualified tag names in ElementTree's "{uri}local" form, used for
# direct comparison against Element.tag and for Element.iter().
W_P = f"{{{NS['w']}}}p"
W_T = f"{{{NS['w']}}}t"
|
||
|
|
|
||
|
|
|
||
|
|
def para_text(p: ET.Element) -> str:
    """Return the concatenated text of every ``w:t`` element under *p*.

    Args:
        p: The element to search; all ``w:t`` descendants are visited in
            document order.

    Returns:
        The ``text`` of each ``w:t`` element joined with no separator, or an
        empty string when there is none.
    """
    # Only ``text`` is collected. ``tail`` is the character data that follows
    # the closing </w:t> tag, i.e. it lies outside the text element; in a
    # pretty-printed document.xml it is indentation/newlines, which must not
    # leak into the paragraph text.
    return "".join(t.text for t in p.iter(W_T) if t.text)
|
||
|
|
|
||
|
|
|
||
|
|
def dump(docx_name, out_txt):
    """Write a numbered index of a .docx file's body paragraphs to a text file.

    Both paths are resolved against the parent of the directory that contains
    this script. Each direct ``w:p`` child of ``w:body`` produces one output
    line of the form ``<index>\\t<text>``, where the text has newlines replaced
    by spaces, is stripped, and is truncated to 200 characters. Indices start
    at 0 and count paragraphs only; other body children are skipped.

    Args:
        docx_name: File name of the .docx archive to read.
        out_txt: File name of the UTF-8 text file to write.

    Raises:
        ValueError: If ``word/document.xml`` contains no ``w:body`` element.
    """
    root_dir = Path(__file__).resolve().parents[1]

    # The context manager closes the archive handle even if reading or
    # parsing raises.
    with zipfile.ZipFile(root_dir / docx_name) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    body = root.find("w:body", NS)
    if body is None:
        # Fail with a clear message instead of a TypeError from iterating None.
        raise ValueError(f"{docx_name}: word/document.xml has no w:body element")

    # findall() with a plain tag matches direct children only, so paragraphs
    # nested deeper (e.g. inside tables) are not indexed.
    lines = []
    for idx, para in enumerate(body.findall(W_P)):
        text = para_text(para).replace("\n", " ").strip()
        lines.append(f"{idx}\t{text[:200]}")

    (root_dir / out_txt).write_text("\n".join(lines), encoding="utf-8")
    print("wrote", len(lines), "paragraphs to", out_txt)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # (source .docx, destination index file) pairs, processed in order.
    jobs = (
        ("xxx控制子系统软件需求规格说明书.docx", "_srs_paragraphs_index.txt"),
        ("xxx控制子系统软件设计说明书.docx", "_sdd_paragraphs_index.txt"),
    )
    for source_name, index_name in jobs:
        dump(source_name, index_name)
|