feat:处理doc转docx
This commit is contained in:
@@ -3,7 +3,7 @@ WORKDIR /app
|
||||
COPY . /app
|
||||
ENV TZ=Asia/Shanghai \
|
||||
LANG=C.UTF-8
|
||||
RUN rm -rf logs .git .idea .venv && apt-get update && apt-get install -y vim curl sqlite3 && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
|
||||
RUN rm -rf logs .git .idea .venv && apt-get update && apt-get install -y pandoc vim curl sqlite3 && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
|
||||
RUN mkdir -p /app/uploads && mkdir -p /app/zip
|
||||
EXPOSE 3006
|
||||
CMD ["python", "main.py"]
|
||||
|
||||
@@ -10,4 +10,4 @@ openpyxl
|
||||
python-multipart
|
||||
Pillow>=10.0.0
|
||||
numpy
|
||||
pywin32
|
||||
pypandoc
|
||||
@@ -13,6 +13,9 @@ import logging
|
||||
from logging_config import LOGGING_CONFIG
|
||||
from service.format_template_resume import format_excel_to_words
|
||||
from service.parse_resume2_doc import extra_resume
|
||||
import pypandoc
|
||||
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
|
||||
@@ -22,6 +25,48 @@ ZIP_PATH = config('ZIP_PATh', default='E://pyptoject//yj_resume//zip//')
|
||||
import pandas as pd
|
||||
import zipfile
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import msvcrt
|
||||
import tempfile
|
||||
|
||||
|
||||
def convert_doc_to_docx_secure(input_file, out_put_dir):
    """Convert a legacy ``.doc`` file to ``.docx`` using headless LibreOffice.

    Runs ``soffice --headless --convert-to docx`` in a child process. The
    child gets a private, throw-away temp directory through its own
    environment; the environment of the current process is left untouched.

    Args:
        input_file: Path of the document to convert.
        out_put_dir: Directory handed to ``soffice --outdir``.

    Returns:
        True when ``soffice`` exits with status 0 and the expected
        ``<input stem>.docx`` exists in ``out_put_dir``. False when the input
        file is missing, ``soffice`` cannot be started, it exits non-zero,
        it exceeds the 30 second timeout, or no output file was produced.
    """
    log = logging.getLogger(__name__)
    input_file = os.fspath(input_file)
    out_put_dir = os.fspath(out_put_dir)

    # Fail fast: there is nothing to convert if the source is not on disk.
    if not os.path.isfile(input_file):
        log.error("convert_doc_to_docx_secure: input file not found: %s", input_file)
        return False

    stem = os.path.splitext(os.path.basename(input_file))[0]
    expected_output = os.path.join(out_put_dir, stem + '.docx')

    with tempfile.TemporaryDirectory() as tmpdir:
        # Give only the child process the scratch directory. Writing TMP/TEMP
        # into os.environ would leave the whole server pointing at a directory
        # that is deleted as soon as this ``with`` block exits.
        child_env = dict(os.environ, TMP=tmpdir, TEMP=tmpdir)

        cmd = [
            'soffice',
            '--headless',
            '--nologo',
            '--nodefault',
            '--norestore',
            '--convert-to', 'docx',
            '--outdir', out_put_dir,
            input_file,
        ]

        try:
            subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                errors='replace',  # never fail on undecodable tool output
                check=True,
                timeout=30,  # guard against a hung converter
                env=child_env,
            )
        except subprocess.CalledProcessError as e:
            log.error("深度错误信息:\nSTDOUT: %s\nSTDERR: %s", e.stdout, e.stderr)
            return False
        except subprocess.TimeoutExpired as e:
            log.error("convert_doc_to_docx_secure: soffice timed out after %ss: %s",
                      e.timeout, input_file)
            return False
        except OSError as e:
            # Typically FileNotFoundError: soffice is not installed / not on PATH.
            log.error("convert_doc_to_docx_secure: unable to start soffice: %s", e)
            return False

    # Do not rely on the exit status alone: confirm the converted file exists.
    if not os.path.isfile(expected_output):
        log.error("convert_doc_to_docx_secure: expected output missing: %s",
                  expected_output)
        return False
    return True
|
||||
|
||||
|
||||
|
||||
|
||||
def check_and_create_directory(files, task_type):
|
||||
logger.info("check_and_create_directory in service")
|
||||
@@ -59,11 +104,13 @@ async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
|
||||
id = str(uuid.uuid4())
|
||||
if fix not in ['.doc', '.docx']:
|
||||
continue
|
||||
|
||||
with open(pathxx.joinpath(id + fix), 'wb') as f:
|
||||
file_content = await file.read()
|
||||
f.write(file_content)
|
||||
|
||||
data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + fix))
|
||||
if fix=='.doc':
|
||||
convert_doc_to_docx_secure(str(pathxx.joinpath(id + fix)),str(pathxx))
|
||||
data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + '.docx'))
|
||||
session = SqliteSqlalchemy().session
|
||||
try:
|
||||
session.bulk_save_objects(data)
|
||||
@@ -82,7 +129,7 @@ def fetch_files(dir_id) -> (bool, str):
|
||||
if not os.path.exists(BASE_PATH):
|
||||
logger.info(f"目录{BASE_PATH}不存在")
|
||||
return False, f"Failed to fetch file 目录{BASE_PATH}不存在"
|
||||
file_extensions = ['.docx', '.doc']
|
||||
file_extensions = ['.docx']
|
||||
files_list = []
|
||||
dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
|
||||
for root, dirs, files in os.walk(dir_path):
|
||||
|
||||
@@ -2,9 +2,8 @@ import os.path
|
||||
import re
|
||||
import json
|
||||
from pathlib import Path
|
||||
import pypandoc
|
||||
|
||||
import pythoncom
|
||||
import win32com.client
|
||||
from docx import Document
|
||||
from typing import Dict, List, Any, Tuple
|
||||
from collections import defaultdict
|
||||
@@ -31,29 +30,8 @@ class EnhancedDocxExtractor:
|
||||
}
|
||||
|
||||
def convert_doc_to_docx(self, file_path: str) -> Document:
    """Convert a ``.doc`` file to ``.docx`` through Word COM automation.

    Opens ``file_path`` in a hidden Word instance and saves a copy next to
    it with the extension replaced by ``.docx`` (``SaveAs`` file format 16).

    Args:
        file_path: Path of the ``.doc`` file; it is made absolute before use.

    Returns:
        The absolute path (``str``) of the written ``.docx`` file, or ``None``
        when any step fails (the error is logged).
        NOTE(review): the ``-> Document`` annotation does not match the value
        actually returned; it is kept unchanged here so the signature stays
        as callers see it.
    """
    logger.info("into convert_doc_to_docx")
    pythoncom.CoInitialize()
    word = None
    doc = None
    try:
        word = win32com.client.Dispatch('Word.Application')
        word.visible = False
        word.DisplayAlerts = False

        abspath = os.path.abspath(file_path)
        logger.info(f" old file_path:{file_path}")
        doc = word.Documents.Open(abspath)

        # Swap only the real extension. A plain str.replace('.doc', '.docx')
        # would also rewrite '.doc' inside directory names and would miss
        # upper-case extensions such as '.DOC'.
        docx_path = os.path.splitext(abspath)[0] + '.docx'
        logger.info(f"new docx_path:{docx_path}")
        doc.SaveAs(docx_path, 16)

        doc.Close()
        doc = None  # closed successfully; nothing left for the cleanup below
        return docx_path
    except Exception as e:
        logger.error(f"错误: {e}")
        return None
    finally:
        # Always release the document and the Word process, even after a
        # failure, so no orphaned Word instance is left running.
        if doc is not None:
            try:
                doc.Close()
            except Exception as e:
                logger.error(f"错误: {e}")
        if word is not None:
            try:
                word.Quit()
            except Exception as e:
                logger.error(f"错误: {e}")
        pythoncom.CoUninitialize()
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -64,10 +42,6 @@ class EnhancedDocxExtractor:
|
||||
提取 .docx 中的表格结构数据
|
||||
"""
|
||||
logger.info(f"into extract_with_table_structure")
|
||||
if (os.path.splitext(docx_path)[1]).lower() == '.doc':
|
||||
logger.info(f"old docx_path:{docx_path} start to convert")
|
||||
docx_path = self.convert_doc_to_docx(docx_path)
|
||||
logger.info(f"new docx_path:{docx_path}")
|
||||
doc = Document(docx_path)
|
||||
results = defaultdict(dict)
|
||||
# 分析每个表格
|
||||
|
||||
Reference in New Issue
Block a user