简历提取,写入

This commit is contained in:
yujj128
2025-12-06 16:44:53 +08:00
parent e9d225939a
commit a124651a7e
6 changed files with 163 additions and 17 deletions

View File

@@ -3,7 +3,8 @@ from sqlalchemy.orm import declarative_base, sessionmaker
# 申明基类对象 # 申明基类对象
Base = declarative_base() Base = declarative_base()
from decouple import config from decouple import config
DB_PATH = config('DB_PATH', default='E://pyptoject//yj_resume//main.sqlite3')
DB_PATH = config('DB_PATH', default='D://PycharmProject//yj_resume//main.sqlite3')
class DBTASK(Base): class DBTASK(Base):

62
logging_config.py Normal file
View File

@@ -0,0 +1,62 @@
# logging_config.py
"""Central logging configuration.

Importing this module configures the root logger and the uvicorn loggers:
INFO and above go to stdout and to a size-rotated file under logs/,
while uvicorn access logs (WARNING and above) go to the file only.
"""
import logging
import logging.config
from pathlib import Path

# Make sure the log directory exists before the file handler tries to open it.
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)

LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
        },
        "detailed": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s",
        }
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "level": "INFO",
            "formatter": "default",
            "stream": "ext://sys.stdout"
        },
        "file": {
            # Rotates automatically once the file reaches maxBytes.
            "class": "logging.handlers.RotatingFileHandler",
            "level": "INFO",
            "formatter": "detailed",
            # Derive the path from log_dir so the directory created above and
            # the handler's target can never drift apart.
            "filename": str(log_dir / "resume.log"),
            "maxBytes": 10485760,  # 10 MB per file
            "backupCount": 5,      # keep 5 rotated backups
            "encoding": "utf8"
        },
    },
    "root": {
        "level": "INFO",
        "handlers": ["console", "file"]
    },
    "loggers": {
        "uvicorn": {
            "level": "INFO",
            "handlers": ["console", "file"],
            "propagate": False
        },
        "uvicorn.error": {
            "level": "INFO",
            "handlers": ["console", "file"],
            "propagate": False
        },
        "uvicorn.access": {
            # WARNING+ and file-only: keeps per-request noise off the console.
            "level": "WARNING",
            "handlers": ["file"],
            "propagate": False
        }
    }
}

# Apply the configuration at import time.
logging.config.dictConfig(LOGGING_CONFIG)

15
main.py
View File

@@ -2,8 +2,13 @@ from fastapi import FastAPI
import uvicorn import uvicorn
from fastapi import FastAPI, File, UploadFile, HTTPException from fastapi import FastAPI, File, UploadFile, HTTPException
from typing import List from typing import List
from service.file_service import check_and_create_directory, upload_and_save_file from service.file_service import check_and_create_directory, upload_and_save_file,fetch_files
from service import excel_service from service import excel_service
import threading
from logging_config import LOGGING_CONFIG
import logging
logger = logging.getLogger(__name__)
app = FastAPI() app = FastAPI()
@@ -19,11 +24,11 @@ async def create_upload_files(files: List[UploadFile] = File(...)):
dir_id = check_and_create_directory(files) dir_id = check_and_create_directory(files)
if not dir_id: if not dir_id:
return {"result": False, "code": 500, "message": "create directory failed"} return {"result": False, "code": 500, "message": "create directory failed"}
flag, message = await upload_and_save_file(dir_id, files) flag, message= await upload_and_save_file(dir_id, files)
logger.info(f"flag is {flag}")
if flag: if flag:
# 触发异步任务,解析文件 TODO flag,message = await fetch_files(dir_id)
pass return {"result": flag, "message": message,"task_id": dir_id}
return {"result": flag, "message": message}
@app.get("/export_task_data_to_excel") @app.get("/export_task_data_to_excel")

View File

@@ -4,7 +4,8 @@ import pandas as pd
import pathlib import pathlib
from decouple import config from decouple import config
BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//') # BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//')
# 导出数据到excel # 导出数据到excel

View File

@@ -1,3 +1,7 @@
import json
from pymupdf import message
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy
import uuid import uuid
from datetime import datetime from datetime import datetime
@@ -6,11 +10,19 @@ import pathlib
from fastapi import File, UploadFile from fastapi import File, UploadFile
from typing import List from typing import List
import os import os
import asyncio
import logging
from service.parse_resume2_doc import extra_resume
logger = logging.getLogger(__name__)
BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//')
BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
def check_and_create_directory(files): def check_and_create_directory(files):
logger.info("check_and_create_directory in service")
# 先创建一个task # 先创建一个task
if not files or len(files) == 0: if not files or len(files) == 0:
return None return None
@@ -32,19 +44,20 @@ def check_and_create_directory(files):
async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str): async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
logger.info(f"upload_and_save_file in service dir_id {dir_id}")
pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id) pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id)
pathxx.mkdir(parents=True, exist_ok=True) pathxx.mkdir(parents=True, exist_ok=True)
data = [] data = []
i = 0
for file in files: for file in files:
name, fix = os.path.splitext(file.filename) name, fix = os.path.splitext(file.filename)
id = str(uuid.uuid4())
if fix not in ['.doc', '.docx']: if fix not in ['.doc', '.docx']:
continue continue
i = i + 1 with open(pathxx.joinpath(id + fix), 'wb') as f:
with open(pathxx.joinpath(str(i) + fix), 'wb') as f:
file_content = await file.read() file_content = await file.read()
f.write(file_content) f.write(file_content)
data.append(DBRESUME(id=str(uuid.uuid4()), task_id=dir_id, status=0, file_name=str(i) + fix))
data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + fix))
session = SqliteSqlalchemy().session session = SqliteSqlalchemy().session
try: try:
session.bulk_save_objects(data) session.bulk_save_objects(data)
@@ -52,8 +65,72 @@ async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
except Exception as e: except Exception as e:
print(f"Failed to save DBRESUME error {e}") print(f"Failed to save DBRESUME error {e}")
session.rollback() session.rollback()
return False, f"Failed to save DBRESUME error {e}" return False, f"Failed to save DBRESUME error {e}",[]
finally: finally:
session.close() session.close()
return True, "success" return True, "success"
async def fetch_files(dir_id) -> (bool, str):
    """Parse every resume under BASE_PATH/<dir_id> and persist the results.

    Walks the task directory, runs ``extra_resume`` on each ``.doc``/``.docx``
    file, then bulk-updates the per-file DBRESUME rows and the parent DBTASK
    row (status + success/fail counters) in a single transaction.

    :param dir_id: task id; also the name of the upload sub-directory.
    :return: ``(flag, message)`` -- ``(True, 'success')`` after a successful
             commit, ``(False, <error text>)`` otherwise.
    """
    logger.info(f"start fetching files task {dir_id} in service")
    if not os.path.exists(BASE_PATH):
        logger.info(f"目录{BASE_PATH}不存在")
        # BUGFIX: always return a (bool, str) pair -- the previous bare
        # `return None` broke callers that unpack `flag, message`.
        return False, f"目录{BASE_PATH}不存在"
    file_extensions = ['.docx', '.doc']
    files_list = []
    dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            _, ext = os.path.splitext(file)
            if file_extensions and ext not in file_extensions:
                logger.error(f"文件{file}格式不符合预期")
                continue
            file_path = os.path.join(root, file)
            if os.path.isfile(file_path):
                files_list.append(file_path)
            else:
                logger.error(f"路径下{file_path}不是文件")
    update_success_mapping = []
    update_fail_mapping = []
    for file in files_list:
        logger.info(f"file is {file} {os.path.basename(file)}")
        file_name = os.path.basename(file)
        # The file stem is the DBRESUME primary key (see upload_and_save_file).
        resume_id = os.path.splitext(file_name)[0]
        result = extra_resume(file)
        # BUGFIX: test emptiness BEFORE json.dumps -- the dumped string is
        # never falsy (even None serializes to the non-empty string "null"),
        # so the old post-dump check could never detect an empty extraction.
        if not result:
            logger.warning(f"file {file_name} 提取为空")
            update_fail_mapping.append({'id': resume_id, 'status': 0,
                                        'message': f"task {dir_id} => file {file_name} 提取为空"})
            continue
        data_info = json.dumps(result, ensure_ascii=False)
        logger.info(f"file content is {data_info}")
        update_success_mapping.append({'id': resume_id, 'status': 1, 'data_info': data_info})
    session = SqliteSqlalchemy().session
    logger.info(f"update success mapping => {update_success_mapping}")
    logger.info(f"update fail mapping => {update_fail_mapping}")
    success_num = len(update_success_mapping)
    fail_num = len(update_fail_mapping)
    try:
        # One transaction covers both the per-file rows and the task row.
        session.bulk_update_mappings(DBRESUME, update_success_mapping + update_fail_mapping)
        if update_fail_mapping:
            # status 2 == finished with failures; keep failure detail on the task row.
            session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 2,
                                                  'success_num': success_num, 'fail_num': fail_num,
                                                  'message': f'fail => {update_fail_mapping}'}])
        else:
            session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 1,
                                                  'success_num': success_num, 'fail_num': fail_num}])
        session.commit()
    except Exception as e:
        logger.error(f"update failed => task {dir_id} error {e}")
        session.rollback()
        return False, f"Failed to update DBRESUME error {e}"
    finally:
        session.close()
    return True, 'success'

View File

@@ -376,9 +376,9 @@ def extra_resume(file_path):
return res return res
if __name__ == "__main__": # if __name__ == "__main__":
# 使用方法 # # 使用方法
docx_file = "../1.报名登记表.docx" # 替换为你的文件 # docx_file = "../1.报名登记表.docx" # 替换为你的文件
print(extra_resume(docx_file)) # print(extra_resume(docx_file))