|
|
|
|
|
import argparse |
|
import os |
|
from pathlib import Path |
|
import shutil |
|
import tempfile |
|
import uuid |
|
|
|
from docx2md.docxfile import DocxFile, DocxFileError |
|
from docx2md.docxmedia import DocxMedia |
|
from docx2md.converter import Converter |
|
|
|
from project_settings import project_path |
|
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown |
|
from toolbox.os.command import Command |
|
|
|
|
|
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
        The ``argparse.Namespace`` produced by the parser. It carries a
        single ``filename`` attribute (str) which defaults to a sample
        .docx path located under ``project_path``.
    """
    # Build the default up front so the add_argument call stays compact.
    default_docx = project_path / "data/files/doc/坏账处理流程 v1.0.docx"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--filename",
        type=str,
        default=default_docx.as_posix(),
    )
    return arg_parser.parse_args()
|
|
|
|
|
class DocxToMarkdown(BaseToMarkdown):
    """Convert a .docx file to Markdown in-process via the docx2md classes.

    The document is opened with ``DocxFile`` and its media are wrapped in
    ``DocxMedia`` at construction time; the Markdown text is produced on
    demand by ``Converter``.
    """

    def __init__(self, filename: str):
        """Open ``filename`` and prepare its media handler.

        Args:
            filename: Path of the .docx file to convert.
        """
        super().__init__(filename)
        # NOTE(review): self.filename is presumably set by
        # BaseToMarkdown.__init__ - confirm against the base class.
        self.docx = DocxFile(self.filename)
        self.media = DocxMedia(self.docx)

    def get_md_text(self, use_md_table: bool = True) -> str:
        """Return the Markdown text converted from the document XML.

        Args:
            use_md_table: Forwarded unchanged to ``Converter`` as its third
                positional argument. Presumably selects Markdown-style
                tables - TODO confirm against docx2md.

        Returns:
            Whatever ``Converter.convert()`` returns for this document.
        """
        xml_text = self.docx.document()
        converter = Converter(xml_text, self.media, use_md_table)
        return converter.convert()

    def save_to_zip(self, output_dir: str) -> str:
        """Write the media and the Markdown file into a zip archive.

        A UUID-named working directory is created under the system temp
        directory, filled with the saved media and ``<uuid>.md``, zipped to
        ``<output_dir>/<uuid>.zip`` and then removed.

        Args:
            output_dir: Directory in which the zip file path is built.

        Returns:
            Path of the zip file, ``os.path.join(output_dir, "<uuid>.zip")``.

        Raises:
            FileExistsError: If the working directory already exists
                (``mkdir`` is called with ``exist_ok=False``).
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # The working directory must not outlive this call, even when media
        # saving, conversion, writing or zipping fails; hence try/finally.
        try:
            self.media.save(temp_dir)

            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text(use_md_table=True)
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            # NOTE(review): zip_directory is not defined in this class;
            # presumably inherited from BaseToMarkdown - confirm.
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)
        return output_zip_file
|
|
|
|
|
@BaseToMarkdown.register("docx2md")
class Docx2md(BaseToMarkdown):
    """Convert a .docx file to Markdown by running ``python -m docx2md``.

    Registered on ``BaseToMarkdown`` under the name ``"docx2md"``.
    """

    def __init__(self, filename: str):
        """Store the input file via the base-class constructor.

        Args:
            filename: Path of the .docx file to convert.
        """
        super().__init__(filename)

    def command(self, filename: str, output_file: str):
        """Build and run the docx2md command line.

        Args:
            filename: Path of the input .docx file.
            output_file: Path of the Markdown file to produce.

        Returns:
            The command string that was handed to ``Command.popen``.
        """
        # NOTE(review): both paths are interpolated between double quotes in
        # a single command string, so a path containing a double quote would
        # break the quoting. How Command.popen executes the string is not
        # visible here - verify before passing untrusted file names.
        cmd = f'python -m docx2md -m "{filename}" "{output_file}"'
        Command.popen(cmd)
        return cmd

    def save_to_zip(self, output_dir: str) -> str:
        """Convert the document and pack the result into a zip archive.

        A UUID-named working directory is created under the system temp
        directory, the command is asked to write ``<uuid>.md`` into it, the
        directory is zipped to ``<output_dir>/<uuid>.zip`` and then removed.

        Args:
            output_dir: Directory in which the zip file path is built.

        Returns:
            Path of the zip file, ``os.path.join(output_dir, "<uuid>.zip")``.

        Raises:
            FileExistsError: If the working directory already exists
                (``mkdir`` is called with ``exist_ok=False``).
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # Remove the working directory even when the conversion command or
        # the zipping step raises, so failed runs do not litter the temp dir.
        try:
            md_file = temp_dir / f"{basename}.md"
            self.command(self.filename, md_file.as_posix())

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            # NOTE(review): zip_directory is not defined in this class;
            # presumably inherited from BaseToMarkdown - confirm.
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)
        return output_zip_file
|
|
|
|
|
def main():
    """Run the script: convert the file given on the command line.

    The ``--filename`` argument is converted through ``Docx2md``, the zip
    archive is written into the current directory, and its path is printed.
    """
    cli_args = get_args()
    zip_path = Docx2md(cli_args.filename).save_to_zip(output_dir=".")
    print(zip_path)
|
|
|
|
|
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|