feat: add file formatting, download, and compression

雷雨
2025-12-10 10:48:39 +08:00
parent 15d778fdb9
commit f32104994d
5 changed files with 169 additions and 31 deletions

db/sql_db.py

@@ -12,6 +12,7 @@ class DBTASK(Base):
id = Column(String(100), primary_key=True)
name = Column(String(100), nullable=False)
create_time = Column(DateTime, nullable=False, )
task_type = Column(String(20), nullable=False, )
# 0 = pending, 1 = success, 2 = failed
status = Column(Integer, nullable=False, default=0)
success_num = Column(Integer, nullable=False, default=0)

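The new task_type column is what lets a single task table back both pipelines: 'parse' tasks come from the resume-parsing upload and 'format' tasks from the new formatting upload. A minimal sketch of querying it, assuming the SqliteSqlalchemy session wrapper from db.sql_db (count_tasks_by_type is a hypothetical helper, not part of this commit):

from db.sql_db import DBTASK, SqliteSqlalchemy

def count_tasks_by_type(task_type: str) -> int:
    # count tasks of one kind, e.g. 'parse' or 'format'
    session = SqliteSqlalchemy().session
    try:
        return session.query(DBTASK).filter(DBTASK.task_type == task_type).count()
    finally:
        session.close()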
main.py

@@ -2,40 +2,45 @@ from fastapi import FastAPI
import uvicorn
from fastapi import FastAPI, File, UploadFile, HTTPException
from typing import List
from service.file_service import check_and_create_directory, upload_and_save_file, fetch_files
from service.file_service import download_format_words, check_and_create_directory, upload_and_format_file, \
upload_and_save_file, fetch_files, fetch_and_format_file
from service import excel_service
from service.db_service import get_task_list
from fastapi.responses import FileResponse
import threading
from logging_config import LOGGING_CONFIG
import logging
import pandas as pd
logger = logging.getLogger(__name__)
app = FastAPI()
import concurrent.futures
executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
executor = concurrent.futures.ThreadPoolExecutor(max_workers=20)
@app.get("/api/v1/hw")
def read_root():
return {"Hello": "World"}
# Upload files and parse them; parsing runs asynchronously
@app.post("/upload_files_and_parse")
@app.post("/yj_resume/upload_files_and_parse")
async def create_upload_files(files: List[UploadFile] = File(...)):
dir_id = check_and_create_directory(files)
dir_id = check_and_create_directory(files, 'parse')
if not dir_id:
return {"result": False, "code": 500, "message": "create directory failed"}
flag, message = await upload_and_save_file(dir_id, files)
logger.info(f"flag is {flag}")
if flag:
#flag, message = await fetch_files(dir_id)
# flag, message = await fetch_files(dir_id)
executor.submit(fetch_files, dir_id)
return {"result": flag, "message": message, "task_id": dir_id}
@app.get("/export_task_data_to_excel")
@app.get("/yj_resume/export_task_data_to_excel")
def export_task_data_to_excel(task_id: str):
path_xx = excel_service.export_task_data_to_excel(task_id)
if not path_xx:
@@ -47,12 +52,43 @@ def export_task_data_to_excel(task_id: str):
)
@app.get("/parse_task_list")
@app.get("/yj_resume/parse_task_list")
def parse_task_list():
data = get_task_list()
data = get_task_list('parse')
return {"data": data, "code": 200, }
@app.get("/yj_resume/format_task_list")
def format_task_list():
data = get_task_list('format')
return {"data": data, "code": 200, }
@app.post("/yj_resume/upload_files_and_format")
async def create_upload_format_files(files: List[UploadFile] = File(...)):
dir_id = check_and_create_directory(files, 'format')
if not dir_id:
return {"result": False, "code": 500, "message": "create directory failed"}
flag, message = await upload_and_format_file(dir_id, files)
logger.info(f"flag is {flag}")
if flag:
# flag, message = await fetch_files(dir_id)
executor.submit(fetch_and_format_file, dir_id)
return {"result": flag, "message": message, "task_id": dir_id}
@app.get("/yj_resume/download_format_words")
def download_format_words_api(task_id: str):
path_xx = download_format_words(task_id)
if not path_xx:
raise HTTPException(status_code=404, detail="file not found")
return FileResponse(
path=path_xx,
media_type="application/octet-stream",  # generic binary stream
filename=f"{task_id}.zip"  # filename the browser uses for the download
)
if __name__ == '__main__':
logger.info("start server")
uvicorn.run(app, host="0.0.0.0", port=3006)

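Taken together, the three new routes form the formatting pipeline: upload an Excel sheet, poll the format task list, then download the zipped Word files. A hypothetical client run against the server started above (requests is assumed to be installed; the file name people.xlsx is made up):

import requests

BASE = "http://localhost:3006/yj_resume"

# 1. upload an .xlsx file; the server responds with the async task id
with open("people.xlsx", "rb") as f:
    resp = requests.post(f"{BASE}/upload_files_and_format",
                         files=[("files", ("people.xlsx", f))])
task_id = resp.json()["task_id"]

# 2. poll the format task list until the task reaches status 1 (success)
tasks = requests.get(f"{BASE}/format_task_list").json()["data"]

# 3. fetch the zip of generated .docx files
zip_resp = requests.get(f"{BASE}/download_format_words", params={"task_id": task_id})
with open(f"{task_id}.zip", "wb") as out:
    out.write(zip_resp.content)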
service/db_service.py

@@ -1,9 +1,10 @@
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy
def get_task_list():
task_list = SqliteSqlalchemy().session.query(DBTASK).order_by(DBTASK.create_time.desc()).all()
result=[]
def get_task_list(task_type):
task_list = SqliteSqlalchemy().session.query(DBTASK).filter(DBTASK.task_type == task_type).order_by(
DBTASK.create_time.desc()).all()
result = []
for task in task_list:
result.append({
"id": task.id,

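One subtlety in the query above: SQLAlchemy's filter_by only accepts keyword arguments, while column expressions such as DBTASK.task_type == task_type belong to filter, which is why the corrected line uses filter. The two spellings below are equivalent:

from db.sql_db import DBTASK, SqliteSqlalchemy

session = SqliteSqlalchemy().session
# filter_by: keyword arguments matched against mapped columns
format_tasks = session.query(DBTASK).filter_by(task_type='format').all()
# filter: full column expressions, needed for anything beyond equality
format_tasks = session.query(DBTASK).filter(DBTASK.task_type == 'format').all()
session.close()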
service/file_service.py

@@ -1,6 +1,5 @@
import json
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy
import uuid
from datetime import datetime
@@ -12,16 +11,19 @@ import os
import asyncio
import logging
from logging_config import LOGGING_CONFIG
from service.format_template_resume import format_excel_to_words
from service.parse_resume2_doc import extra_resume
logger = logging.getLogger(__name__)
BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//uploads//')
BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
ZIP_PATH = config('ZIP_PATH', default='E://pyptoject//yj_resume//zip//')
import pandas as pd
import zipfile
def check_and_create_directory(files):
def check_and_create_directory(files, task_type):
logger.info("check_and_create_directory in service")
# create a task record first
if not files or len(files) == 0:
@@ -31,8 +33,8 @@ def check_and_create_directory(files):
current_time = datetime.now()
# format the timestamp as a string
formatted_time = current_time.strftime("%Y-%m-%d-%H-%M-%S")
task = DBTASK(id=id, create_time=datetime.now(), status=0, success_num=0, total_num=len(files),
fail_num=0,name=f"解析任务({formatted_time})")
task = DBTASK(id=id, task_type=task_type, create_time=datetime.now(), status=0, success_num=0, total_num=len(files),
fail_num=0, name=f"解析任务({formatted_time})")
session = SqliteSqlalchemy().session
try:
@@ -74,8 +76,8 @@ async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
session.close()
return True, "success"
def fetch_files(dir_id) -> (bool, str):
logger.info(f"start fetching files task {dir_id} in service")
if not os.path.exists(BASE_PATH):
logger.info(f"目录{BASE_PATH}不存在")
@@ -83,13 +85,13 @@ def fetch_files(dir_id) -> (bool, str):
file_extensions = ['.docx', '.doc']
files_list = []
dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
for root,dirs,files in os.walk(dir_path):
for root, dirs, files in os.walk(dir_path):
for file in files:
_,ext = os.path.splitext(file)
_, ext = os.path.splitext(file)
if file_extensions and ext not in file_extensions:
logger.error(f"文件{file}格式不符合预期")
continue
file_path = os.path.join(root,file)
file_path = os.path.join(root, file)
if os.path.isfile(file_path):
files_list.append(file_path)
else:
@@ -106,10 +108,10 @@ def fetch_files(dir_id) -> (bool, str):
logger.info(f"file content is {result}")
if not result:
logger.warning(f"file {file_name} 提取为空")
update_fail_mapping.append({'id':id, 'status':0,
'message': f"task {dir_id} => file {file_name} extracted empty content"})
update_fail_mapping.append({'id': id, 'status': 0,
'message': f"task {dir_id} => file {file_name} extracted empty content"})
continue
update_success_mapping.append({'id':id, 'status':1,'data_info': result})
update_success_mapping.append({'id': id, 'status': 1, 'data_info': result})
session = SqliteSqlalchemy().session
logger.info(f"update success mapping => {update_success_mapping}")
logger.info(f"update fail mapping => {update_fail_mapping}")
@@ -120,8 +122,8 @@ def fetch_files(dir_id) -> (bool, str):
session.bulk_update_mappings(DBRESUME, update_data)
if update_fail_mapping:
session.bulk_update_mappings(DBTASK, [{'id':dir_id, 'status':2, 'success_num':success_num,
'fail_num':fail_num,'message':f'fail => {update_fail_mapping}'}])
session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 2, 'success_num': success_num,
'fail_num': fail_num, 'message': f'fail => {update_fail_mapping}'}])
else:
session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 1,
'success_num': success_num, 'fail_num': fail_num}])
@@ -136,5 +138,48 @@ def fetch_files(dir_id) -> (bool, str):
return True, 'success'
async def upload_and_format_file(dir_id, files: List[UploadFile]) -> (bool, str):
logger.info(f"upload_and_format_file in service dir_id {dir_id}")
pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id)
pathxx.mkdir(parents=True, exist_ok=True)
for file in files:
name, fix = os.path.splitext(file.filename)
if fix not in ['.xls', '.xlsx']:
continue
with open(pathxx.joinpath(dir_id + fix), 'wb') as f:
file_content = await file.read()
f.write(file_content)
return True, "success"
def zip_file_folder(dir_id):
pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id)
# make sure the zip output directory exists before writing into it
pathlib.Path(ZIP_PATH).mkdir(parents=True, exist_ok=True)
output_filename = pathlib.Path(ZIP_PATH).joinpath(dir_id + ".zip")
with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
# walk every file and subdirectory under the folder
for root, dirs, files in os.walk(pathxx):
for file in files:
# build the full file path
file_path = os.path.join(root, file)
# add the file to the ZIP, using a path relative to the source folder
zipf.write(file_path, arcname=os.path.relpath(file_path, pathxx))
def fetch_and_format_file(dir_id) -> (bool, str):
logger.info(f"fetch_and_format_file in service dir_id {dir_id}")
pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id)
pathx_1 = pathxx.joinpath(dir_id + ".xlsx")
if not pathx_1.exists():
pathx_1 = pathxx.joinpath(dir_id + ".xls")
# bail out instead of letting read_excel raise when no sheet was uploaded
if not pathx_1.exists():
logger.error(f"task {dir_id}: no .xlsx or .xls file found")
return False, "excel file not found"
data = pd.read_excel(pathx_1)
data_dict = data.to_dict('records')
format_excel_to_words(dir_id, data_dict)
zip_file_folder(dir_id)
return True, "success"
def download_format_words(task_id):
pathxx = pathlib.Path(ZIP_PATH).joinpath(task_id + ".zip")
if not pathxx.exists():
return None
return pathxx

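A caveat on the read_excel step above: pandas returns empty cells as NaN floats, so string operations downstream (such as the len() call in convert_data) can raise TypeError on sparse sheets. A defensive variant of the load, under the same assumptions about the sheet layout (clean_records is a hypothetical helper):

import pandas as pd

def clean_records(excel_path) -> list[dict]:
    # coerce every cell to str and turn NaN/empty cells into ''
    # so later len() checks and string concatenation stay safe
    data = pd.read_excel(excel_path).fillna('').astype(str)
    return data.to_dict('records')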
service/format_template_resume.py

@@ -1,6 +1,10 @@
from docxtpl import DocxTemplate
from pathlib import Path
from decouple import config
import pathlib, logging
import uuid
logger = logging.getLogger(__name__)
BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
context = {
'name': '张三',
'sex': '',
@@ -29,7 +33,58 @@ context = {
'''
}
file_path = Path.cwd().joinpath('template.docx')
template = DocxTemplate(file_path)
template.render(context)
template.save('E://resu//output.docx')
def dict_get_mul_key(keys: list, dict_data: dict):
for k in keys:
if k in dict_data.keys():
return dict_data[k]
return ''
def convert_data(old_dict: dict) -> dict:
new_dict = {}
new_dict['name'] = dict_get_mul_key(['姓名', '姓 名'], old_dict)
new_dict['sex'] = dict_get_mul_key(['性别', '性 别'], old_dict)
new_dict['nation'] = dict_get_mul_key(['民族', '民 族'], old_dict)
new_dict['brith'] = dict_get_mul_key(['出生年月', '出生年月(岁)'], old_dict)
new_dict['origin'] = dict_get_mul_key(['籍贯', '籍 贯'], old_dict)
new_dict['address'] = dict_get_mul_key(['出 生 地', '出生地'], old_dict)
new_dict['education'] = dict_get_mul_key(['学历', '学 历'], old_dict)
new_dict['degree'] = dict_get_mul_key(['学位', '学 位'], old_dict)
new_dict['politics'] = '党员' if len(dict_get_mul_key(['入党时间', '入 党 时 间'], old_dict)) > 0 else '群众'
new_dict['department'] = dict_get_mul_key(['部门', '部 门'], old_dict)
new_dict['position'] = dict_get_mul_key(['现任职务', '现 任 职 务'], old_dict)
new_dict['phone'] = dict_get_mul_key(['手机号', '手 机 号'], old_dict)
new_dict['title'] = dict_get_mul_key(['专业技术职务', '职 称'], old_dict)
new_dict['start_work_time'] = dict_get_mul_key(['开始工作时间', '开始 工作 时间'], old_dict)
new_dict['id_number'] = dict_get_mul_key(['身份证', '身 份 证'], old_dict)
new_dict['honor'] = dict_get_mul_key(['奖惩情况', '奖惩 情况'], old_dict)
new_dict['work_text'] = dict_get_mul_key(['简历', '简 历'], old_dict)
return new_dict
def format_and_write_file(dir_id: str, ctx: dict):
logger.info(f'format_and_write_file dir id is {dir_id}')
# ctx.get('name', ...) would keep an empty string, so use `or` to fall
# back to a random file name when the row has no usable name
user_name = ctx.get('name') or str(uuid.uuid4())
file_path = Path.cwd().joinpath('template.docx')
template = DocxTemplate(file_path)
template.render(ctx)
pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id)
pathxx = pathxx.joinpath(user_name + '.docx')
logger.info(f'saving formatted resume to {pathxx}')
template.save(pathxx)
def format_excel_to_words(dir_id: str, dict_data_list: list[dict]):
if not dict_data_list or len(dict_data_list) < 1:
return
for dict_data in dict_data_list:
# TODO: also write a copy to the database for later aggregation into Excel
new_data = convert_data(dict_data)
logger.info(f'converted row => {new_data}')
format_and_write_file(dir_id, new_data)
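A quick usage sketch of the conversion path with one hand-built row, assuming this module's convert_data and format_excel_to_words are importable; the keys mirror the column headers convert_data looks up, and the values are made-up sample data:

sample_row = {
    '姓名': '张三',
    '性别': '男',
    '学历': '本科',
    '入党时间': '2010-07',
    '简历': '2010.09-2014.07 ...',
}

print(convert_data(sample_row))
# politics resolves to '党员' because 入党时间 is non-empty;
# keys missing from the row fall back to ''

# rendering the row into template.docx under a task directory:
# format_excel_to_words('demo-task-id', [sample_row])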