Compare commits

...

4 Commits

Author   SHA1        Message                                                               Date
yujj128  5f3c61c18c  Merge branch 'dev' of http://106.13.42.156:33077/lei_y601/yj_resume  2025-12-06 18:00:31 +08:00
yujj128  0f666f18c1  Special handling for empty values                                     2025-12-06 18:00:04 +08:00
雷雨     c00328ed8d  feat: make file processing asynchronous                               2025-12-06 17:39:21 +08:00
yujj128  16583dbb06  Resume extraction and writing                                         2025-12-06 17:04:05 +08:00
5 changed files with 22 additions and 16 deletions


@@ -4,7 +4,7 @@ from sqlalchemy.orm import declarative_base, sessionmaker
 Base = declarative_base()
 from decouple import config
-DB_PATH = config('DB_PATH', default='D://PycharmProject//yj_resume//main.sqlite3')
+DB_PATH = config('DB_PATH', default='E://pyptoject//yj_resume//main.sqlite3')
 class DBTASK(Base):
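Both sides of the DB_PATH hunk above hard-code a machine-specific absolute default, which is why this line keeps churning between commits. With python-decouple the value can come from an environment variable or a .env file, so the committed default rarely matters. A minimal sketch of that pattern, assuming python-decouple is installed (the relative fallback path here is hypothetical):

# Sketch: DB_PATH resolves from the environment or a .env file,
# falling back to a portable relative default only when neither is set.
from decouple import config

DB_PATH = config('DB_PATH', default='./main.sqlite3')  # hypothetical default

# .env (kept out of version control):
# DB_PATH=/srv/yj_resume/main.sqlite3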

main.py

@@ -2,7 +2,7 @@ from fastapi import FastAPI
 import uvicorn
 from fastapi import FastAPI, File, UploadFile, HTTPException
 from typing import List
-from service.file_service import check_and_create_directory, upload_and_save_file,fetch_files
+from service.file_service import check_and_create_directory, upload_and_save_file, fetch_files
 from service import excel_service
 from service.db_service import get_task_list
 from fastapi.responses import FileResponse
@@ -14,7 +14,8 @@ logger = logging.getLogger(__name__)
 app = FastAPI()
+import concurrent.futures
+executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
 @app.get("/api/v1/hw")
 def read_root():
     return {"Hello": "World"}
@@ -26,11 +27,12 @@ async def create_upload_files(files: List[UploadFile] = File(...)):
     dir_id = check_and_create_directory(files)
     if not dir_id:
         return {"result": False, "code": 500, "message": "create directory failed"}
-    flag, message= await upload_and_save_file(dir_id, files)
+    flag, message = await upload_and_save_file(dir_id, files)
     logger.info(f"flag is {flag}")
     if flag:
-        flag,message = await fetch_files(dir_id)
-    return {"result": flag, "message": message,"task_id": dir_id}
+        #flag, message = await fetch_files(dir_id)
+        executor.submit(fetch_files, dir_id)
+    return {"result": flag, "message": message, "task_id": dir_id}
 @app.get("/export_task_data_to_excel")
@@ -52,4 +54,5 @@ def parse_task_list():
 if __name__ == '__main__':
-    uvicorn.run(app, host="127.0.0.1", port=3006)
+    logger.info("start server")
+    uvicorn.run(app, host="0.0.0.0", port=3006)


@@ -4,8 +4,8 @@ import pandas as pd
 import pathlib
 from decouple import config
-# BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
-BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//')
+BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
+#BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//uploads//')
 # Export data to Excel


@@ -16,15 +16,16 @@ import logging
 from service.parse_resume2_doc import extra_resume
 logger = logging.getLogger(__name__)
-BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//')
+#BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//uploads//')
+BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
 def check_and_create_directory(files):
     logger.info("check_and_create_directory in service")
     # First create a task
     if not files or len(files) == 0:
+        logger.warning("check_and_create_directory is empty")
         return None
     id = str(uuid.uuid4())
     current_time = datetime.now()
@@ -68,17 +69,17 @@ async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
     except Exception as e:
         print(f"Failed to save DBRESUME error {e}")
         session.rollback()
-        return False, f"Failed to save DBRESUME error {e}",[]
+        return False, f"Failed to save DBRESUME error {e}"
     finally:
         session.close()
     return True, "success"
-async def fetch_files(dir_id) -> (bool, str):
+def fetch_files(dir_id) -> (bool, str):
     logger.info(f"start fetching files task {dir_id} in service")
     if not os.path.exists(BASE_PATH):
         logger.info(f"directory {BASE_PATH} does not exist")
-        return None
+        return False, f"Failed to fetch file: directory {BASE_PATH} does not exist"
     file_extensions = ['.docx', '.doc']
     files_list = []
     dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
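Two contract fixes travel together in the hunk above: fetch_files drops async so that executor.submit in main.py can run it directly in a worker thread (submitting a coroutine function would only create an un-awaited coroutine), and its early exit now returns the (bool, str) pair its annotation promises instead of a bare None. A minimal sketch of the resulting shape, with BASE_PATH hard-coded as a hypothetical stand-in for the config lookup:

import logging
import os
import pathlib

logger = logging.getLogger(__name__)
BASE_PATH = './uploads/'  # hypothetical; the real value comes from config('BASE_PATH', ...)

def fetch_files(dir_id) -> (bool, str):
    # Plain def, not async def: this runs inside ThreadPoolExecutor workers.
    if not os.path.exists(BASE_PATH):
        msg = f"directory {BASE_PATH} does not exist"
        logger.info(msg)
        return False, msg  # every exit path now matches the (bool, str) contract
    dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
    docs = [p for p in dir_path.rglob('*') if p.suffix in ('.docx', '.doc')]
    # ... parse each document and persist the results ...
    return True, f"fetched {len(docs)} files"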


@@ -124,10 +124,12 @@ class EnhancedDocxExtractor:
        spec_coll = ['全日制教育','在职教育']
        if current_key_cell['text'].replace('\n','') in spec_coll :
            if not value_cell['text']:
-               value_cell['text'] = 'False'
+               value_cell['text'] = ""
            else:
-               value_cell['text'] = 'True'
+               value_cell['text'] = ''
+       if not value_cell['text']:
+           value_cell['text'] = "None"
        if value_cell['text'] and (key_row, key_col + 1) not in visited:
            # Check whether this value matches the one extracted for the previous key (possibly a merged cell)
            if not self._is_key_duplicate_merged_cell(structure[key_row][key_col]['text'], kv_pairs):
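This hunk is the "special handling for empty values" commit: for the two special education keys (spec_coll holds the Chinese labels for full-time and in-service education) the extractor used to write 'True'/'False' marker strings; now it blanks those values, and a new fallback turns any still-empty value cell into the string "None" so the following if value_cell['text'] check still passes. A minimal sketch of that normalization, assuming cells are dicts shaped like {'text': str}, as in the surrounding extractor code:

SPEC_COLL = ['全日制教育', '在职教育']  # 'full-time education', 'in-service education'

def normalize_value(key_cell: dict, value_cell: dict) -> dict:
    if key_cell['text'].replace('\n', '') in SPEC_COLL:
        value_cell['text'] = ''      # special keys never keep a literal value
    if not value_cell['text']:
        value_cell['text'] = 'None'  # non-empty placeholder keeps later truthiness checks working
    return value_cell

print(normalize_value({'text': '全日制教育'}, {'text': '本科'}))  # {'text': 'None'}
print(normalize_value({'text': '姓名'}, {'text': ''}))            # {'text': 'None'}
print(normalize_value({'text': '姓名'}, {'text': '张三'}))         # {'text': '张三'}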