简历提取,写入

This commit is contained in:
yujj128
2025-12-06 16:44:53 +08:00
parent e9d225939a
commit a124651a7e
6 changed files with 163 additions and 17 deletions

View File

@@ -3,7 +3,8 @@ from sqlalchemy.orm import declarative_base, sessionmaker
# 申明基类对象
Base = declarative_base()
from decouple import config
DB_PATH = config('DB_PATH', default='E://pyptoject//yj_resume//main.sqlite3')
DB_PATH = config('DB_PATH', default='D://PycharmProject//yj_resume//main.sqlite3')
class DBTASK(Base):

62
logging_config.py Normal file
View File

@@ -0,0 +1,62 @@
# logging_config.py
import logging
import logging.config
from pathlib import Path
# 确保 logs 目录存在
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)
LOGGING_CONFIG = {
"version": 1,
"disable_existing_loggers": False,
"formatters": {
"default": {
"format": "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
},
"detailed": {
"format": "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s",
}
},
"handlers": {
"console": {
"class": "logging.StreamHandler",
"level": "INFO",
"formatter": "default",
"stream": "ext://sys.stdout"
},
"file": {
"class": "logging.handlers.RotatingFileHandler", # 自动轮转
"level": "INFO",
"formatter": "detailed",
"filename": "logs/resume.log",
"maxBytes": 10485760, # 10MB
"backupCount": 5, # 保留5个备份
"encoding": "utf8"
},
},
"root": {
"level": "INFO",
"handlers": ["console", "file"]
},
"loggers": {
"uvicorn": {
"level": "INFO",
"handlers": ["console", "file"],
"propagate": False
},
"uvicorn.error": {
"level": "INFO",
"handlers": ["console", "file"],
"propagate": False
},
"uvicorn.access": {
"level": "WARNING", # 只记录警告以上,避免刷屏
"handlers": ["file"], # 只写入文件
"propagate": False
}
}
}
# 应用配置
logging.config.dictConfig(LOGGING_CONFIG)

13
main.py
View File

@@ -2,8 +2,13 @@ from fastapi import FastAPI
import uvicorn
from fastapi import FastAPI, File, UploadFile, HTTPException
from typing import List
from service.file_service import check_and_create_directory, upload_and_save_file
from service.file_service import check_and_create_directory, upload_and_save_file,fetch_files
from service import excel_service
import threading
from logging_config import LOGGING_CONFIG
import logging
logger = logging.getLogger(__name__)
app = FastAPI()
@@ -20,10 +25,10 @@ async def create_upload_files(files: List[UploadFile] = File(...)):
if not dir_id:
return {"result": False, "code": 500, "message": "create directory failed"}
flag, message= await upload_and_save_file(dir_id, files)
logger.info(f"flag is {flag}")
if flag:
# 触发异步任务,解析文件 TODO
pass
return {"result": flag, "message": message}
flag,message = await fetch_files(dir_id)
return {"result": flag, "message": message,"task_id": dir_id}
@app.get("/export_task_data_to_excel")

View File

@@ -4,7 +4,8 @@ import pandas as pd
import pathlib
from decouple import config
BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
# BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//')
# 导出数据到excel

View File

@@ -1,3 +1,7 @@
import json
from pymupdf import message
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy
import uuid
from datetime import datetime
@@ -6,11 +10,19 @@ import pathlib
from fastapi import File, UploadFile
from typing import List
import os
import asyncio
import logging
from service.parse_resume2_doc import extra_resume
logger = logging.getLogger(__name__)
BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//')
BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
def check_and_create_directory(files):
logger.info("check_and_create_directory in service")
# 先创建一个task
if not files or len(files) == 0:
return None
@@ -32,19 +44,20 @@ def check_and_create_directory(files):
async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
logger.info(f"upload_and_save_file in service dir_id {dir_id}")
pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id)
pathxx.mkdir(parents=True, exist_ok=True)
data = []
i = 0
for file in files:
name, fix = os.path.splitext(file.filename)
id = str(uuid.uuid4())
if fix not in ['.doc', '.docx']:
continue
i = i + 1
with open(pathxx.joinpath(str(i) + fix), 'wb') as f:
with open(pathxx.joinpath(id + fix), 'wb') as f:
file_content = await file.read()
f.write(file_content)
data.append(DBRESUME(id=str(uuid.uuid4()), task_id=dir_id, status=0, file_name=str(i) + fix))
data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + fix))
session = SqliteSqlalchemy().session
try:
session.bulk_save_objects(data)
@@ -52,8 +65,72 @@ async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
except Exception as e:
print(f"Failed to save DBRESUME error {e}")
session.rollback()
return False, f"Failed to save DBRESUME error {e}"
return False, f"Failed to save DBRESUME error {e}",[]
finally:
session.close()
return True, "success"
async def fetch_files(dir_id) -> (bool, str):
logger.info(f"start fetching files task {dir_id} in service")
if not os.path.exists(BASE_PATH):
logger.info(f"目录{BASE_PATH}不存在")
return None
file_extensions = ['.docx', '.doc']
files_list = []
dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
for root,dirs,files in os.walk(dir_path):
for file in files:
_,ext = os.path.splitext(file)
if file_extensions and ext not in file_extensions:
logger.error(f"文件{file}格式不符合预期")
continue
file_path = os.path.join(root,file)
if os.path.isfile(file_path):
files_list.append(file_path)
else:
logger.error(f"路径下{file_path}不是文件")
update_success_mapping = []
update_fail_mapping = []
for file in files_list:
logger.info(f"file is {file} {os.path.basename(file)}")
file_name = os.path.basename(file)
id = os.path.splitext(file_name)[0]
result = extra_resume(file)
result = json.dumps(result, ensure_ascii=False)
logger.info(f"result type is {type(result)}")
logger.info(f"file content is {result}")
if not result:
logger.warning(f"file {file_name} 提取为空")
update_fail_mapping.append({'id':id, 'status':0,
'message': f"task {dir_id} => file {file_name} 提取为空"})
continue
update_success_mapping.append({'id':id, 'status':1,'data_info': result})
session = SqliteSqlalchemy().session
logger.info(f"update success mapping => {update_success_mapping}")
logger.info(f"update fail mapping => {update_fail_mapping}")
success_num = len(update_success_mapping)
fail_num = len(update_fail_mapping)
try:
update_data = update_success_mapping + update_fail_mapping
session.bulk_update_mappings(DBRESUME, update_data)
if update_fail_mapping:
session.bulk_update_mappings(DBTASK, [{'id':dir_id, 'status':2, 'success_num':success_num,
'fail_num':fail_num,'message':f'fail => {update_fail_mapping}'}])
else:
session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 1,
'success_num': success_num, 'fail_num': fail_num}])
session.commit()
except Exception as e:
logger.error(f"update failed => task {dir_id} error {e}")
session.rollback()
return False, f"Failed to update DBRESUME error {e}"
finally:
session.close()
return True, 'success'

View File

@@ -376,9 +376,9 @@ def extra_resume(file_path):
return res
if __name__ == "__main__":
# 使用方法
docx_file = "../1.报名登记表.docx" # 替换为你的文件
print(extra_resume(docx_file))
# if __name__ == "__main__":
# # 使用方法
# docx_file = "../1.报名登记表.docx" # 替换为你的文件
# print(extra_resume(docx_file))