feat:处理doc转docx

This commit is contained in:
雷雨
2025-12-10 15:25:16 +08:00
parent e057917151
commit 5e70e79365
4 changed files with 55 additions and 34 deletions

View File

@@ -3,7 +3,7 @@ WORKDIR /app
COPY . /app
ENV TZ=Asia/Shanghai \
LANG=C.UTF-8
RUN rm -rf logs .git .idea .venv && apt-get update && apt-get install -y vim curl sqlite3 && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
RUN rm -rf logs .git .idea .venv && apt-get update && apt-get install -y pandoc vim curl sqlite3 && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
RUN mkdir -p /app/uploads && mkdir -p /app/zip
EXPOSE 3006
CMD ["python", "main.py"]

View File

@@ -10,4 +10,4 @@ openpyxl
python-multipart
Pillow>=10.0.0
numpy
pywin32
pypandoc

View File

@@ -13,6 +13,9 @@ import logging
from logging_config import LOGGING_CONFIG
from service.format_template_resume import format_excel_to_words
from service.parse_resume2_doc import extra_resume
import pypandoc
logger = logging.getLogger(__name__)
BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
@@ -22,6 +25,48 @@ ZIP_PATH = config('ZIP_PATh', default='E://pyptoject//yj_resume//zip//')
import pandas as pd
import zipfile
import os
import subprocess
import msvcrt
import tempfile
def convert_doc_to_docx_secure(input_file, out_put_dir, timeout=30):
    """Convert a ``.doc`` file to ``.docx`` by running headless ``soffice``.

    The converter is started as a child process with ``--convert-to docx``
    and ``--outdir out_put_dir``. A throw-away temporary directory is handed
    to the child through the ``TMP``/``TEMP`` environment variables; the
    environment of the current process is left untouched.

    Args:
        input_file: Path of the document to convert.
        out_put_dir: Directory that should receive the converted file.
        timeout: Seconds to wait for the converter before giving up.

    Returns:
        True when the converter exited successfully and
        ``<out_put_dir>/<input stem>.docx`` exists afterwards; False when the
        executable could not be started, exited with an error, timed out, or
        produced no output file.
    """
    log = logging.getLogger(__name__)

    # Callers rely on the output being named after the input with a .docx
    # suffix, so that is the file whose presence is verified below.
    stem = os.path.splitext(os.path.basename(input_file))[0]
    expected_output = os.path.join(out_put_dir, stem + '.docx')

    # Build the command as a list (no shell involved).
    cmd = [
        'soffice',
        '--headless',
        '--nologo',
        '--nodefault',
        '--norestore',
        '--convert-to', 'docx',
        '--outdir', out_put_dir,
        input_file
    ]

    with tempfile.TemporaryDirectory() as tmpdir:
        # Pass the scratch directory only to the child. Writing it into
        # os.environ would leave the whole process pointing at a directory
        # that is deleted as soon as this ``with`` block exits.
        child_env = dict(os.environ, TMP=tmpdir, TEMP=tmpdir)
        try:
            subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=True,
                timeout=timeout,  # guard against a hung converter
                env=child_env,
            )
        except subprocess.CalledProcessError as e:
            log.error(f"深度错误信息:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
            return False
        except subprocess.TimeoutExpired:
            log.error(f"soffice timed out after {timeout}s converting {input_file}")
            return False
        except OSError as e:
            # Raised when the soffice executable is missing or not runnable.
            log.error(f"failed to start soffice: {e}")
            return False

    # A zero exit status alone does not prove a file was written.
    if not os.path.isfile(expected_output):
        log.error(f"soffice finished but {expected_output} was not created")
        return False
    return True
def check_and_create_directory(files, task_type):
logger.info("check_and_create_directory in service")
@@ -59,11 +104,13 @@ async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
id = str(uuid.uuid4())
if fix not in ['.doc', '.docx']:
continue
with open(pathxx.joinpath(id + fix), 'wb') as f:
file_content = await file.read()
f.write(file_content)
data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + fix))
if fix=='.doc':
convert_doc_to_docx_secure(str(pathxx.joinpath(id + fix)),str(pathxx))
data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + '.docx'))
session = SqliteSqlalchemy().session
try:
session.bulk_save_objects(data)
@@ -82,7 +129,7 @@ def fetch_files(dir_id) -> (bool, str):
if not os.path.exists(BASE_PATH):
logger.info(f"目录{BASE_PATH}不存在")
return False, f"Failed to fetch file 目录{BASE_PATH}不存在"
file_extensions = ['.docx', '.doc']
file_extensions = ['.docx']
files_list = []
dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
for root, dirs, files in os.walk(dir_path):

View File

@@ -2,9 +2,8 @@ import os.path
import re
import json
from pathlib import Path
import pypandoc
import pythoncom
import win32com.client
from docx import Document
from typing import Dict, List, Any, Tuple
from collections import defaultdict
@@ -31,29 +30,8 @@ class EnhancedDocxExtractor:
}
def convert_doc_to_docx(self, file_path: str) -> Document:
logger.info("into convert_doc_to_docx")
pythoncom.CoInitialize()
try:
word = win32com.client.Dispatch('Word.Application')
logger.info(f"test1")
word.visible = False
logger.info(f"test2")
word.DisplayAlerts = False
logger.info(f" old file_path:{file_path}")
doc = word.Documents.Open(os.path.abspath(file_path))
logger.info(f"test3")
abspath = os.path.abspath(file_path)
docx_path = abspath.replace('.doc', '.docx')
logger.info(f"new docx_path:{docx_path}")
doc.SaveAs(docx_path, 16)
doc.Close()
word.Quit()
return docx_path
except Exception as e:
logger.error(f"错误: {e}")
return None
finally:
pythoncom.CoUninitialize()
pass
@@ -64,10 +42,6 @@ class EnhancedDocxExtractor:
提取 .docx 中的表格结构数据
"""
logger.info(f"into extract_with_table_structure")
if (os.path.splitext(docx_path)[1]).lower() == '.doc':
logger.info(f"old docx_path:{docx_path} start to convert")
docx_path = self.convert_doc_to_docx(docx_path)
logger.info(f"new docx_path:{docx_path}")
doc = Document(docx_path)
results = defaultdict(dict)
# 分析每个表格