feat:处理doc转docx

2025-12-10 15:25:16 +08:00
parent e057917151
commit 5e70e79365
4 changed files with 55 additions and 34 deletions
--- a/2
+++ b/2
@@ -3,7 +3,7 @@ WORKDIR /app
 COPY . /app
 ENV TZ=Asia/Shanghai \
    LANG=C.UTF-8
-RUN rm -rf  logs .git .idea .venv && apt-get update && apt-get install -y vim curl sqlite3 && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
+RUN rm -rf  logs .git .idea .venv && apt-get update && apt-get install -y pandoc libreoffice-writer vim curl sqlite3 && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
 RUN mkdir -p /app/uploads &&  mkdir -p /app/zip
 EXPOSE 3006
 CMD ["python", "main.py"]
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,4 @@ openpyxl
 python-multipart
 Pillow>=10.0.0
 numpy
-pywin32
+pypandoc
--- a/service/file_service.py
+++ b/service/file_service.py
@@ -13,6 +13,9 @@ import logging
 from logging_config import LOGGING_CONFIG
 from service.format_template_resume import format_excel_to_words
 from service.parse_resume2_doc import extra_resume
+import pypandoc
+
+

 logger = logging.getLogger(__name__)
 BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
@@ -22,6 +25,48 @@ ZIP_PATH = config('ZIP_PATh', default='E://pyptoject//yj_resume//zip//')
 import pandas as pd
 import zipfile

+import os
+import subprocess
+import tempfile
+
+try:
+    import msvcrt  # Windows-only stdlib module; not importable on the Linux (apt-get) image
+except ImportError:
+    msvcrt = None
+
+
+def convert_doc_to_docx_secure(input_file, out_put_dir):
+    """Convert a legacy ``.doc`` file to ``.docx`` using headless LibreOffice.
+
+    ``soffice`` runs with TMP/TEMP pointing at a throw-away directory. The
+    override is passed only to the child process, so this process's own
+    environment never ends up referencing an already-deleted directory.
+
+    :param input_file: path of the source ``.doc`` file.
+    :param out_put_dir: directory that receives ``<input stem>.docx``.
+    :return: True if soffice exited with 0 and the ``.docx`` exists, else False.
+    """
+    stem = os.path.splitext(os.path.basename(input_file))[0]
+    expected = os.path.join(out_put_dir, stem + '.docx')
+    cmd = ['soffice', '--headless', '--nologo', '--nodefault', '--norestore',
+           '--convert-to', 'docx', '--outdir', out_put_dir, input_file]
+    with tempfile.TemporaryDirectory() as tmpdir:
+        child_env = dict(os.environ, TMP=tmpdir, TEMP=tmpdir)
+        try:
+            # The timeout keeps a hung soffice from blocking the caller forever.
+            subprocess.run(cmd, capture_output=True, text=True, check=True,
+                           timeout=30, env=child_env)
+        except subprocess.CalledProcessError as e:
+            logger.error(f"深度错误信息：\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
+            return False
+        except (subprocess.TimeoutExpired, OSError) as e:
+            # OSError covers a missing or unlaunchable soffice binary.
+            logger.error(f"soffice could not convert {input_file}: {e}")
+            return False
+    # A zero exit code alone is not trusted: confirm the output file exists.
+    return os.path.isfile(expected)
+
+

 def check_and_create_directory(files, task_type):
    logger.info("check_and_create_directory in service")
@@ -59,11 +104,13 @@ async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
        id = str(uuid.uuid4())
        if fix not in ['.doc', '.docx']:
            continue
+
        with open(pathxx.joinpath(id + fix), 'wb') as f:
            file_content = await file.read()
            f.write(file_content)
-
-        data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + fix))
+        if fix=='.doc':
+            convert_doc_to_docx_secure(str(pathxx.joinpath(id + fix)),str(pathxx))
+        data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + '.docx'))
    session = SqliteSqlalchemy().session
    try:
        session.bulk_save_objects(data)
@@ -82,7 +129,7 @@ def fetch_files(dir_id) -> (bool, str):
    if not os.path.exists(BASE_PATH):
        logger.info(f"目录{BASE_PATH}不存在")
        return False, f"Failed to fetch file 目录{BASE_PATH}不存在"
-    file_extensions = ['.docx', '.doc']
+    file_extensions = ['.docx']
    files_list = []
    dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
    for root, dirs, files in os.walk(dir_path):
--- a/service/parse_resume2_doc.py
+++ b/service/parse_resume2_doc.py
@@ -2,9 +2,8 @@ import os.path
 import re
 import json
 from pathlib import Path
+import  pypandoc

-import pythoncom
-import win32com.client
 from docx import Document
 from typing import Dict, List, Any, Tuple
 from collections import defaultdict
@@ -31,29 +30,8 @@ class EnhancedDocxExtractor:
        }

    def convert_doc_to_docx(self, file_path: str) -> Document:
-        logger.info("into convert_doc_to_docx")
-        pythoncom.CoInitialize()
-        try:
-            word = win32com.client.Dispatch('Word.Application')
-            logger.info(f"test1")
-            word.visible = False
-            logger.info(f"test2")
-            word.DisplayAlerts = False
-            logger.info(f" old file_path:{file_path}")
-            doc = word.Documents.Open(os.path.abspath(file_path))
-            logger.info(f"test3")
-            abspath = os.path.abspath(file_path)
-            docx_path = abspath.replace('.doc', '.docx')
-            logger.info(f"new  docx_path:{docx_path}")
-            doc.SaveAs(docx_path, 16)
-            doc.Close()
-            word.Quit()
-            return docx_path
-        except Exception as e:
-            logger.error(f"错误: {e}")
-            return None
-        finally:
-            pythoncom.CoUninitialize()
+        """No-op stub (returns None); .doc uploads are converted to .docx at upload time in service.file_service."""
+        pass



@@ -64,10 +42,6 @@ class EnhancedDocxExtractor:
        提取 .docx 中的表格结构数据
        """
        logger.info(f"into extract_with_table_structure")
-        if (os.path.splitext(docx_path)[1]).lower() == '.doc':
-            logger.info(f"old docx_path:{docx_path} start to convert")
-            docx_path = self.convert_doc_to_docx(docx_path)
-            logger.info(f"new docx_path:{docx_path}")
        doc = Document(docx_path)
        results = defaultdict(dict)
        # 分析每个表格