Files
yj_resume/service/file_service.py
2025-12-12 15:31:14 +08:00

255 lines
8.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
from sqlalchemy import update
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy, DBEXCEL
import uuid
from datetime import datetime
from decouple import config
import pathlib
from fastapi import File, UploadFile
from typing import List
import os
import asyncio
import logging
from logging_config import LOGGING_CONFIG
from service.format_template_resume import format_excel_to_words
from service.parse_resume2_doc import extra_resume
import pypandoc
logger = logging.getLogger(__name__)
# BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
# ZIP_PATH = config('ZIP_PATh', default='E://pyptoject//yj_resume//zip//')
#
BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//uploads//')
ZIP_PATH = config('ZIP_PATh', default='D://PycharmProject//yj_resume//zip//')
import pandas as pd
import zipfile
import os
import subprocess
import msvcrt
import tempfile
def convert_doc_to_docx_secure(input_file, out_put_dir):
    """Convert a legacy .doc file to .docx using headless LibreOffice.

    Args:
        input_file: Path to the source .doc file.
        out_put_dir: Directory the converted .docx is written to
            (soffice keeps the original base name).

    Returns:
        True on success, False if the conversion failed, timed out,
        or the ``soffice`` binary is not available.
    """
    # Isolated temp dir so concurrent soffice runs don't collide on
    # shared temp files / user profile state.
    with tempfile.TemporaryDirectory() as tmpdir:
        os.environ['TMP'] = tmpdir
        os.environ['TEMP'] = tmpdir
        cmd = [
            'soffice',
            '--headless',
            '--nologo',
            '--nodefault',
            '--norestore',
            '--convert-to', 'docx',
            '--outdir', out_put_dir,
            input_file,
        ]
        try:
            subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                check=True,
                timeout=30,  # guard against soffice hanging indefinitely
            )
            return True
        except subprocess.CalledProcessError as e:
            logger.error(f"soffice conversion failed:\nSTDOUT: {e.stdout}\nSTDERR: {e.stderr}")
            return False
        except (subprocess.TimeoutExpired, FileNotFoundError) as e:
            # BUG FIX: the original only caught CalledProcessError, so a
            # timeout or a missing soffice install crashed the caller.
            logger.error(f"soffice conversion error for {input_file}: {e}")
            return False
def check_and_create_directory(files, task_type):
    """Create a DBTASK row for a batch of uploaded files.

    Args:
        files: The uploaded files belonging to this task; ``len(files)``
            becomes the task's ``total_num``.
        task_type: Task type code stored on the DBTASK row.

    Returns:
        The new task id (uuid4 string), or None when ``files`` is empty
        or the DB insert fails.
    """
    logger.info("check_and_create_directory in service")
    if not files:
        logger.warning("check_and_create_directory is empty")
        return None
    task_id = str(uuid.uuid4())  # renamed: `id` shadowed the builtin
    # Use ONE timestamp for both the create_time column and the
    # human-readable task name so they can never disagree (the original
    # called datetime.now() twice).
    current_time = datetime.now()
    formatted_time = current_time.strftime("%Y-%m-%d-%H-%M-%S")
    task = DBTASK(id=task_id, task_type=task_type, create_time=current_time,
                  status=0, success_num=0, total_num=len(files),
                  fail_num=0, name=f"解析任务({formatted_time})")
    session = SqliteSqlalchemy().session
    try:
        session.add(task)
        session.commit()
    except Exception as e:
        # Use the module logger instead of print, consistent with the
        # rest of this service.
        logger.error(f"Failed to save DBTASK info error {e}")
        session.rollback()
        return None
    finally:
        session.close()
    return task_id
async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
    """Persist uploaded resume files (.doc/.docx) under BASE_PATH/dir_id.

    Each accepted file is stored under a fresh uuid; ``.doc`` files are
    additionally converted to ``.docx`` via LibreOffice. A DBRESUME row
    (linked to task ``dir_id``) is recorded per stored file. Files with
    any other extension are silently skipped.

    Returns:
        (True, "success") on success, (False, message) when the DB
        insert fails.
    """
    logger.info(f"upload_and_save_file in service dir_id {dir_id}")
    task_dir = pathlib.Path(BASE_PATH).joinpath(dir_id)
    task_dir.mkdir(parents=True, exist_ok=True)
    data = []
    for file in files:
        _name, ext = os.path.splitext(file.filename)
        if ext not in ('.doc', '.docx'):
            continue
        resume_id = str(uuid.uuid4())  # renamed: `id` shadowed the builtin
        target = task_dir.joinpath(resume_id + ext)
        with open(target, 'wb') as f:
            f.write(await file.read())
        if ext == '.doc':
            # BUG FIX: the original ignored the conversion result, so a
            # failed conversion still produced a DBRESUME row pointing at
            # a .docx file that doesn't exist. Skip the row instead.
            if not convert_doc_to_docx_secure(str(target), str(task_dir)):
                logger.error(f"doc->docx conversion failed for {target}")
                continue
        data.append(DBRESUME(id=resume_id, task_id=dir_id, status=0,
                             file_name=resume_id + '.docx'))
    session = SqliteSqlalchemy().session
    try:
        session.bulk_save_objects(data)
        session.commit()
    except Exception as e:
        logger.error(f"Failed to save DBRESUME error {e}")
        session.rollback()
        return False, f"Failed to save DBRESUME error {e}"
    finally:
        session.close()
    return True, "success"
def _collect_docx_files(dir_path):
    """Return every .docx file path under dir_path (recursively).

    Non-.docx entries and non-file paths are logged and skipped.
    """
    files_list = []
    for root, _dirs, files in os.walk(dir_path):
        for file in files:
            _, ext = os.path.splitext(file)
            if ext != '.docx':
                logger.error(f"文件{file}格式不符合预期")
                continue
            file_path = os.path.join(root, file)
            if os.path.isfile(file_path):
                files_list.append(file_path)
            else:
                logger.error(f"路径下{file_path}不是文件")
    return files_list


def fetch_files(dir_id) -> (bool, str):
    """Extract resume data from every .docx in the task's upload folder
    and write per-file results plus the task status back to the DB.

    Each file's stem is its DBRESUME id. Successful extractions are
    stored as JSON in ``data_info`` with status 1; empty extractions
    are marked status 0 with a failure message. DBTASK is then set to
    status 1 (all ok) or 2 (some failures) with success/fail counts.

    Returns:
        (True, 'success') on success, (False, message) on failure.
    """
    logger.info(f"start fetching files task {dir_id} in service")
    if not os.path.exists(BASE_PATH):
        logger.info(f"目录{BASE_PATH}不存在")
        return False, f"Failed to fetch file 目录{BASE_PATH}不存在"
    dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
    files_list = _collect_docx_files(dir_path)
    update_success_mapping = []
    update_fail_mapping = []
    for file in files_list:
        file_name = os.path.basename(file)
        logger.info(f"file is {file} {file_name}")
        resume_id = os.path.splitext(file_name)[0]
        result = extra_resume(file)
        # BUG FIX: check emptiness on the RAW extraction result. The
        # original json.dumps-ed first, turning an empty result into the
        # truthy string "null"/"{}" and making this branch unreachable.
        if not result:
            logger.warning(f"file {file_name} 提取为空")
            update_fail_mapping.append({'id': resume_id, 'status': 0,
                                        'message': f"task {dir_id} => file {file_name} 提取为空"})
            continue
        result = json.dumps(result, ensure_ascii=False)
        logger.info(f"file content is {result}")
        update_success_mapping.append({'id': resume_id, 'status': 1, 'data_info': result})
    session = SqliteSqlalchemy().session
    logger.info(f"update success mapping => {update_success_mapping}")
    logger.info(f"update fail mapping => {update_fail_mapping}")
    success_num = len(update_success_mapping)
    fail_num = len(update_fail_mapping)
    try:
        update_data = update_success_mapping + update_fail_mapping
        session.bulk_update_mappings(DBRESUME, update_data)
        if update_fail_mapping:
            session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 2, 'success_num': success_num,
                                                   'fail_num': fail_num, 'message': f'fail => {update_fail_mapping}'}])
        else:
            session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 1,
                                                   'success_num': success_num, 'fail_num': fail_num}])
        session.commit()
    except Exception as e:
        logger.error(f"update failed => task {dir_id} error {e}")
        session.rollback()
        return False, f"Failed to update DBRESUME error {e}"
    finally:
        session.close()
    return True, 'success'
async def upload_and_format_file(dir_id, files: List[UploadFile]) -> (bool, str):
    """Save an uploaded Excel template as BASE_PATH/dir_id/dir_id.<ext>
    and record a DBEXCEL row for it.

    Only the FIRST .xls/.xlsx file in ``files`` is kept: every candidate
    would be written to the same dir_id-derived path and would share the
    same DBEXCEL primary key (``id=dir_id``), so later ones could only
    overwrite the first and break the bulk insert.

    Returns:
        (True, "success") on success, (False, message) when the DB
        insert fails.
    """
    logger.info(f"upload_and_format_file in service dir_id {dir_id}")
    task_dir = pathlib.Path(BASE_PATH).joinpath(dir_id)
    task_dir.mkdir(parents=True, exist_ok=True)
    data = []
    for file in files:
        _name, ext = os.path.splitext(file.filename)
        if ext not in ('.xls', '.xlsx'):
            continue
        with open(task_dir.joinpath(dir_id + ext), 'wb') as f:
            f.write(await file.read())
        # BUG FIX: record the file's real extension — the original always
        # wrote '.xlsx' to file_name even when a .xls had been saved.
        data.append(DBEXCEL(id=dir_id, status=0, file_name=dir_id + ext))
        break
    session = SqliteSqlalchemy().session
    try:
        session.bulk_save_objects(data)
        session.commit()
    except Exception as e:
        logger.error(f"Failed to save DBEXCEL error {e}")
        session.rollback()
        return False, f"Failed to save DBEXCEL error {e}"
    finally:
        session.close()
    return True, "success"
def zip_file_folder(dir_id):
    """Archive the task's upload folder into ZIP_PATH/<dir_id>.zip.

    Every file under BASE_PATH/<dir_id> (including subfolders) is added
    with a path relative to the task folder itself.
    """
    source_dir = pathlib.Path(BASE_PATH).joinpath(dir_id)
    archive_path = pathlib.Path(ZIP_PATH).joinpath(dir_id + ".zip")
    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for root, _dirs, names in os.walk(source_dir):
            for name in names:
                full_path = os.path.join(root, name)
                # Store relative paths so the zip unpacks without the
                # absolute BASE_PATH prefix.
                archive.write(full_path,
                              arcname=os.path.relpath(full_path, source_dir))
def fetch_and_format_file(dir_id) -> (bool, str):
    """Render the task's uploaded Excel rows into Word documents and zip them.

    Reads BASE_PATH/dir_id/dir_id.xlsx (falling back to .xls), feeds each
    row dict to format_excel_to_words, zips the task folder into ZIP_PATH,
    then marks the DBTASK as finished (status=1).

    Returns:
        (True, 'success') on success, (False, message) on failure.
        BUG FIX: the original declared this return type but never
        returned anything, and silently swallowed DB errors.
    """
    logger.info(f"fetch_and_format_file in service dir_id {dir_id}")
    task_dir = pathlib.Path(BASE_PATH).joinpath(dir_id)
    excel_path = task_dir.joinpath(dir_id + ".xlsx")
    if not excel_path.exists():
        excel_path = task_dir.joinpath(dir_id + ".xls")
    if not excel_path.exists():
        # Neither spelling of the workbook is present; the original would
        # have crashed inside pd.read_excel here.
        logger.error(f"excel file for task {dir_id} not found")
        return False, f"excel file for task {dir_id} not found"
    data_dict = pd.read_excel(excel_path).to_dict('records')
    logger.info(f"excel rows => {data_dict}")
    format_excel_to_words(dir_id, data_dict)
    zip_file_folder(dir_id)
    session = SqliteSqlalchemy().session
    try:
        session.execute(update(DBTASK).where(DBTASK.id == dir_id).values(status=1))
        session.commit()
    except Exception as e:
        # Previously rolled back with no trace at all.
        logger.error(f"failed to mark task {dir_id} done: {e}")
        session.rollback()
        return False, f"failed to mark task {dir_id} done: {e}"
    finally:
        session.close()
    return True, 'success'
def download_format_words(task_id):
    """Return the path to the task's result zip, or None when it is missing."""
    archive = pathlib.Path(ZIP_PATH).joinpath(task_id + ".zip")
    return archive if archive.exists() else None