Compare commits

18 Commits: 351df35642 ... master

| SHA1 |
|---|
| 5f3c61c18c |
| 0f666f18c1 |
| c00328ed8d |
| 16583dbb06 |
| ec0995d08a |
| a124651a7e |
| a7ddfcde2a |
| 43af924920 |
| e9d225939a |
| ff1c0e890c |
| 9fd3376557 |
| 8f35513063 |
| f32aa61c0f |
| 4e8995eaed |
| f1063146d2 |
| 992bab2887 |
| eb32528f7e |
| dcc6db2363 |
Dockerfile (Normal file)

@@ -0,0 +1,9 @@
FROM docker.m.daocloud.io/python:3.12-slim
WORKDIR /app
COPY . /app
ENV TZ=Asia/Shanghai \
    LANG=C.UTF-8
RUN rm -rf logs .git .idea .venv && apt-get update && apt-get install -y vim curl sqlite3 && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
RUN mkdir -p /app/uploads
EXPOSE 3006
CMD ["python", "main.py"]
db/__init__.py (Normal file, empty)
db/sql_db.py (Normal file)

@@ -0,0 +1,48 @@
from sqlalchemy import Column, DateTime, Integer, BigInteger, String, create_engine, Boolean, Text
from sqlalchemy.orm import declarative_base, sessionmaker
from decouple import config

# Declare the declarative base
Base = declarative_base()

DB_PATH = config('DB_PATH', default='E://pyptoject//yj_resume//main.sqlite3')


class DBTASK(Base):
    __tablename__ = 'db_task'
    id = Column(String(100), primary_key=True)
    name = Column(String(100), nullable=False)
    create_time = Column(DateTime, nullable=False)
    # 0 = pending, 1 = success, 2 = failed
    status = Column(Integer, nullable=False, default=0)
    success_num = Column(Integer, nullable=False, default=0)
    total_num = Column(Integer, nullable=False, default=0)
    fail_num = Column(Integer, nullable=False, default=0)
    message = Column(Text, nullable=True)


class DBRESUME(Base):
    __tablename__ = 'db_resume'
    id = Column(String(100), primary_key=True)
    # Each task corresponds to one upload-folder ID
    task_id = Column(String(100), nullable=False)
    # 0 = pending, 1 = success, 2 = failed
    status = Column(Integer, nullable=False, default=0)
    file_name = Column(String(100), nullable=True)
    # Extracted data, serialized as JSON
    data_info = Column(Text, nullable=True)
    # Error messages and the like
    message = Column(Text, nullable=True)


class SqliteSqlalchemy(object):
    def __init__(self):
        # Create the SQLite engine
        engine = create_engine(f'sqlite:///{DB_PATH}', echo=True)
        # Create the tables if they do not exist yet
        Base.metadata.create_all(engine, checkfirst=True)
        # Create the session bound to the engine
        self.session = sessionmaker(bind=engine)()
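For orientation, a minimal usage sketch of the session factory above (not part of the commit; assumes `DB_PATH` points at a writable SQLite file):

```python
# Hypothetical usage of SqliteSqlalchemy; not part of this diff.
import uuid
from datetime import datetime

from db.sql_db import DBTASK, SqliteSqlalchemy

session = SqliteSqlalchemy().session  # creates the tables on first use
task = DBTASK(id=str(uuid.uuid4()), name="demo task", create_time=datetime.now(),
              status=0, success_num=0, total_num=1, fail_num=0)
session.add(task)
session.commit()
print(session.query(DBTASK).count())
session.close()
```

Note that every `SqliteSqlalchemy()` call builds a fresh engine and session; the services in this diff treat it as a throwaway session factory and close the session after each unit of work.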
logging_config.py (Normal file)

@@ -0,0 +1,62 @@
# logging_config.py
import logging
import logging.config
from pathlib import Path

# Ensure the logs directory exists
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)

LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
        },
        "detailed": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s",
        }
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "level": "INFO",
            "formatter": "default",
            "stream": "ext://sys.stdout"
        },
        "file": {
            "class": "logging.handlers.RotatingFileHandler",  # rotates automatically
            "level": "INFO",
            "formatter": "detailed",
            "filename": "logs/resume.log",
            "maxBytes": 10485760,  # 10 MB
            "backupCount": 5,  # keep 5 backups
            "encoding": "utf8"
        },
    },
    "root": {
        "level": "INFO",
        "handlers": ["console", "file"]
    },
    "loggers": {
        "uvicorn": {
            "level": "INFO",
            "handlers": ["console", "file"],
            "propagate": False
        },
        "uvicorn.error": {
            "level": "INFO",
            "handlers": ["console", "file"],
            "propagate": False
        },
        "uvicorn.access": {
            "level": "WARNING",  # warnings and above only, to avoid flooding the log
            "handlers": ["file"],  # write to the file only
            "propagate": False
        }
    }
}

# Apply the configuration
logging.config.dictConfig(LOGGING_CONFIG)
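Since `dictConfig` runs at module import time, importing `logging_config` is enough to configure logging everywhere. A minimal sketch (hypothetical, not in the commit):

```python
# Importing logging_config applies LOGGING_CONFIG as a side effect.
import logging_config  # noqa: F401  (imported for its side effect)
import logging

logger = logging.getLogger(__name__)
logger.info("written to stdout and to logs/resume.log")
```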
main.py (modified)

@@ -1,14 +1,58 @@
 from fastapi import FastAPI
 import uvicorn
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from typing import List
+from service.file_service import check_and_create_directory, upload_and_save_file, fetch_files
+from service import excel_service
+from service.db_service import get_task_list
+from fastapi.responses import FileResponse
+import threading
+from logging_config import LOGGING_CONFIG
+import logging
+
+logger = logging.getLogger(__name__)
+
 app = FastAPI()
+
+import concurrent.futures
+
+executor = concurrent.futures.ThreadPoolExecutor(max_workers=10)
+
+
 @app.get("/api/v1/hw")
 def read_root():
     return {"Hello": "World"}
+
+
+# Upload files and parse them; the parsing step runs asynchronously
+@app.post("/upload_files_and_parse")
+async def create_upload_files(files: List[UploadFile] = File(...)):
+    dir_id = check_and_create_directory(files)
+    if not dir_id:
+        return {"result": False, "code": 500, "message": "create directory failed"}
+    flag, message = await upload_and_save_file(dir_id, files)
+    logger.info(f"flag is {flag}")
+    if flag:
+        # flag, message = await fetch_files(dir_id)
+        executor.submit(fetch_files, dir_id)
+    return {"result": flag, "message": message, "task_id": dir_id}
+
+
+@app.get("/export_task_data_to_excel")
+def export_task_data_to_excel(task_id: str):
+    path_xx = excel_service.export_task_data_to_excel(task_id)
+    if not path_xx:
+        raise HTTPException(status_code=404, detail="file not found")
+    return FileResponse(
+        path=path_xx,
+        media_type="application/octet-stream",  # generic binary stream
+        filename=f"{task_id}.xlsx"  # file name the browser uses when downloading
+    )
+
+
+@app.get("/parse_task_list")
+def parse_task_list():
+    data = get_task_list()
+    return {"data": data, "code": 200}
+
+
 if __name__ == '__main__':
-    uvicorn.run(app, host="127.0.0.1", port=3006)
+    logger.info("start server")
+    uvicorn.run(app, host="0.0.0.0", port=3006)
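A client-side sketch of the three endpoints above (hypothetical, not part of the commit; assumes the `requests` package and a server on 127.0.0.1:3006, and the file names are placeholders):

```python
# Hypothetical client for the endpoints added in main.py.
import requests

BASE = "http://127.0.0.1:3006"

# Upload two .docx resumes; parsing is kicked off in the background.
with open("a.docx", "rb") as f1, open("b.docx", "rb") as f2:
    payload = [("files", ("a.docx", f1)), ("files", ("b.docx", f2))]
    task = requests.post(f"{BASE}/upload_files_and_parse", files=payload).json()
print(task)  # e.g. {"result": true, "message": "success", "task_id": "..."}

# Check task progress, then download the Excel export once the task succeeds.
print(requests.get(f"{BASE}/parse_task_list").json())
resp = requests.get(f"{BASE}/export_task_data_to_excel",
                    params={"task_id": task["task_id"]})
if resp.ok:
    with open(f"{task['task_id']}.xlsx", "wb") as out:
        out.write(resp.content)
```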
requirements.txt (modified)

@@ -1,3 +1,18 @@
-python-docx== 0.8.11
+python-docx
 fastapi
 uvicorn
+docxtpl
+SQLAlchemy
+python-decouple
+python-multipart
+pandas
+openpyxl
+
+python-multipart
+PyMuPDF>=1.23.0
+paddlepaddle>=2.5.0
+paddleocr>=2.7.0.3
+opencv-python>=4.8.0
+numpy>=1.24.0
+pdf2image>=1.16.3
+Pillow>=10.0.0
service/__init__.py (Normal file, empty)
service/db_service.py (Normal file)

@@ -0,0 +1,18 @@
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy


def get_task_list():
    task_list = SqliteSqlalchemy().session.query(DBTASK).order_by(DBTASK.create_time.desc()).all()
    result = []
    for task in task_list:
        result.append({
            "id": task.id,
            "name": task.name,
            "success_num": task.success_num,
            "fail_num": task.fail_num,
            "status": task.status,
            "total_num": task.total_num,
            "message": task.message,
            "create_time": task.create_time.strftime("%Y-%m-%d %H:%M:%S") if task.create_time else None,
        })
    return result
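A hypothetical smoke test for `get_task_list` (not part of the commit; the keys mirror the dict built above):

```python
# Hypothetical smoke test; assumes the SQLite file from db.sql_db exists.
from service.db_service import get_task_list

for row in get_task_list():
    print(row["create_time"], row["status"], row["name"],
          f'{row["success_num"]}/{row["total_num"]}')
```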
service/excel_service.py (Normal file)

@@ -0,0 +1,35 @@
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy
import json
import pandas as pd
import pathlib
from decouple import config

BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
# BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//uploads//')


# Export the task's data to Excel
def export_to_excel(task_id):
    # Fetch all successfully parsed resumes for this task
    list_data = SqliteSqlalchemy().session.query(DBRESUME).filter_by(task_id=task_id, status=1).all()
    pd_data = []
    for data in list_data:
        pd_data.append(json.loads(data.data_info))
    data_frame = pd.DataFrame(pd_data)
    # Write the Excel file
    pathxx = pathlib.Path(BASE_PATH).joinpath(task_id)
    pathxx = pathxx.joinpath(f"{task_id}.xlsx")
    data_frame.to_excel(pathxx, index=False)


def export_task_data_to_excel(task_id):
    pathxx = pathlib.Path(BASE_PATH).joinpath(task_id)
    pathxx = pathxx.joinpath(f"{task_id}.xlsx")
    if pathxx.exists():
        return pathxx
    session = SqliteSqlalchemy().session
    task = session.query(DBTASK).filter_by(id=task_id).first()
    if not task or task.status == 0 or task.status == 2:
        return None
    export_to_excel(task_id)
    return pathxx
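`export_task_data_to_excel` effectively memoizes on disk: an existing workbook is returned immediately, and the export is regenerated only for tasks whose status is 1 (fully parsed). A hypothetical direct call (the task id is a placeholder):

```python
# Hypothetical direct use of the exporter; not part of this diff.
from service.excel_service import export_task_data_to_excel

path = export_task_data_to_excel("some-task-id")  # placeholder id
if path is None:
    print("unknown task, still pending, or failed")
else:
    print(f"workbook ready at {path}")
```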
service/file_service.py (Normal file)

@@ -0,0 +1,140 @@
import json

from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy
import uuid
from datetime import datetime
from decouple import config
import pathlib
from fastapi import File, UploadFile
from typing import List
import os
import asyncio
import logging

from service.parse_resume2_doc import extra_resume

logger = logging.getLogger(__name__)
# BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//uploads//')

BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')


def check_and_create_directory(files):
    logger.info("check_and_create_directory in service")
    # Create the task record first
    if not files or len(files) == 0:
        logger.warning("check_and_create_directory is empty")
        return None
    id = str(uuid.uuid4())
    current_time = datetime.now()
    # Format the timestamp as a string
    formatted_time = current_time.strftime("%Y-%m-%d-%H-%M-%S")
    task = DBTASK(id=id, create_time=datetime.now(), status=0, success_num=0, total_num=len(files),
                  fail_num=0, name=f"解析任务({formatted_time})")

    session = SqliteSqlalchemy().session
    try:
        session.add(task)
        session.commit()
    except Exception as e:
        print(f"Failed to save DBTASK info error {e}")
        session.rollback()
        return None
    finally:
        session.close()
    return id


async def upload_and_save_file(dir_id, files: List[UploadFile]) -> (bool, str):
    logger.info(f"upload_and_save_file in service dir_id {dir_id}")
    pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id)
    pathxx.mkdir(parents=True, exist_ok=True)
    data = []
    for file in files:
        name, fix = os.path.splitext(file.filename)
        id = str(uuid.uuid4())
        if fix not in ['.doc', '.docx']:
            continue
        with open(pathxx.joinpath(id + fix), 'wb') as f:
            file_content = await file.read()
            f.write(file_content)

        data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + fix))
    session = SqliteSqlalchemy().session
    try:
        session.bulk_save_objects(data)
        session.commit()
    except Exception as e:
        print(f"Failed to save DBRESUME error {e}")
        session.rollback()
        return False, f"Failed to save DBRESUME error {e}"
    finally:
        session.close()
    return True, "success"


def fetch_files(dir_id) -> (bool, str):
    logger.info(f"start fetching files task {dir_id} in service")
    if not os.path.exists(BASE_PATH):
        logger.info(f"directory {BASE_PATH} does not exist")
        return False, f"Failed to fetch files: directory {BASE_PATH} does not exist"
    file_extensions = ['.docx', '.doc']
    files_list = []
    dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            _, ext = os.path.splitext(file)
            if file_extensions and ext not in file_extensions:
                logger.error(f"file {file} has an unexpected extension")
                continue
            file_path = os.path.join(root, file)
            if os.path.isfile(file_path):
                files_list.append(file_path)
            else:
                logger.error(f"path {file_path} is not a file")
    update_success_mapping = []
    update_fail_mapping = []
    for file in files_list:
        logger.info(f"file is {file} {os.path.basename(file)}")
        file_name = os.path.basename(file)
        id = os.path.splitext(file_name)[0]
        result = extra_resume(file)
        logger.info(f"result type is {type(result)}")
        # Check for an empty extraction before serializing, since json.dumps
        # always returns a non-empty (truthy) string
        if not result:
            logger.warning(f"file {file_name} produced no extracted data")
            update_fail_mapping.append({'id': id, 'status': 0,
                                        'message': f"task {dir_id} => file {file_name} extracted nothing"})
            continue
        result = json.dumps(result, ensure_ascii=False)
        logger.info(f"file content is {result}")
        update_success_mapping.append({'id': id, 'status': 1, 'data_info': result})
    session = SqliteSqlalchemy().session
    logger.info(f"update success mapping => {update_success_mapping}")
    logger.info(f"update fail mapping => {update_fail_mapping}")
    success_num = len(update_success_mapping)
    fail_num = len(update_fail_mapping)
    try:
        update_data = update_success_mapping + update_fail_mapping
        session.bulk_update_mappings(DBRESUME, update_data)

        if update_fail_mapping:
            session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 2, 'success_num': success_num,
                                                   'fail_num': fail_num, 'message': f'fail => {update_fail_mapping}'}])
        else:
            session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 1,
                                                   'success_num': success_num, 'fail_num': fail_num}])
        session.commit()
    except Exception as e:
        logger.error(f"update failed => task {dir_id} error {e}")
        session.rollback()
        return False, f"Failed to update DBRESUME error {e}"
    finally:
        session.close()

    return True, 'success'
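An offline driver sketch for the pipeline above (hypothetical, not part of the commit; `FakeUpload` is a stand-in providing the only two `UploadFile` members the code touches, `.filename` and the awaitable `.read()`, and it assumes a writable BASE_PATH and DB_PATH):

```python
# Hypothetical offline driver for file_service; payload bytes are placeholders.
import asyncio
import io

from service.file_service import check_and_create_directory, upload_and_save_file, fetch_files


class FakeUpload:
    """Stand-in for fastapi.UploadFile: only .filename and read() are used."""

    def __init__(self, filename: str, payload: bytes):
        self.filename = filename
        self._buf = io.BytesIO(payload)

    async def read(self) -> bytes:
        return self._buf.read()


files = [FakeUpload("resume.docx", b"...docx bytes...")]
dir_id = check_and_create_directory(files)          # creates the DBTASK row
ok, msg = asyncio.run(upload_and_save_file(dir_id, files))
print(ok, msg)
fetch_files(dir_id)  # parse synchronously instead of via the executor
```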
service/format_template_resume.py (Normal file)

@@ -0,0 +1,35 @@
from docxtpl import DocxTemplate
from pathlib import Path

context = {
    'name': '张三',
    'sex': '男',
    'nation': '汉族',
    'brith': '1990-01-01',
    'address': '北京市海淀区西二旗',
    'education': '本科',
    'degree': '学士',
    # native place
    'origin': '山东',
    'politics': '党员',
    # department
    'department': '数信部',
    'position': '助理开发工程师',
    'phone': '13812345678',
    'title': '后端开发工程师',
    'start_work_time': '2018-01-01',
    # ID card number
    'id_number': '500221199001010010101',
    # honors; left to the LLM
    'honor': '一等奖',
    # work description
    'work_text': '''
2023.12-2024.10:负责《边缘计算+5G自组网的水电物联网系统建设与研究》项目异常检测算法和项目实施:利用5G自组网技术、自建边缘计算单元等,实际实现在线异常检测、时间序列趋势分析、模型轻量化等功能,缓解通信带宽压力;在观音岩、彭水、渝能等场站实施应用。完成项目科技成果凝练、项目报奖等工作。本项目工作获得第六届全国设备管理与技术创新成果一等奖、中电联职工创新成果二等奖。
2024.04-2025.至今:广西河池源网荷储一体化项目/大唐西藏玉曲河扎拉电厂可行性研究报告&方案编写、AI支持中心方案策划
'''
}

file_path = Path.cwd().joinpath('template.docx')
template = DocxTemplate(file_path)
template.render(context)
template.save('E://resu//output.docx')
service/parse_resume2_doc.py (Normal file)

@@ -0,0 +1,386 @@
import re
import json
from pathlib import Path

from docx import Document
from typing import Dict, List, Any, Tuple
from collections import defaultdict


class EnhancedDocxExtractor:
    def __init__(self):
        # Known variants of each field name (spacing and colon differences)
        self.field_variants = {
            '姓名': ['姓名', '姓 名', '姓 名', '姓名:', '姓 名:', '姓 名'],
            '性别': ['性别', '性 别', '性 别', '性别:', '性 别:', '性 别'],
            '出生年月': ['出生年月', '出生年月:', '出生日期', '出生日期:'],
            '民族': ['民族', '民族:', '民 族'],
            '政治面貌': ['政治面貌', '政治面貌:', '政治面貌:'],
            '现任职单位及部门': ['现任职单位及部门', '单位及部门', '工作单位', '现任职单位'],
            '联系电话': ['联系电话', '电话', '手机', '联系电话:', '手机号'],
            '联系地址': ['联系地址', '地址', '联系地址:', '家庭地址'],
            '学历学位': ['学历', '学历:', '学 历', '学历\n学位', '学位', '学位:', '学 位'],
            '毕业院校': ['毕业院校', '毕业学校', '毕业院校:', '毕业院校系及专业'],
            '专业': ['专业', '专业:', '系及专业', '所学专业'],
        }

    def extract_with_table_structure(self, docx_path: str) -> Dict[str, Any]:
        """
        Extract the table-structured data from a .docx file.
        """
        doc = Document(docx_path)
        results = defaultdict(dict)
        # Walk every table
        for table_idx, table in enumerate(doc.tables):
            print(f"\nprocessing table {table_idx + 1} ({len(table.rows)} rows × {len(table.columns)} cols)")

            # Build the cell metadata
            table_structure = self._analyze_table_structure(table)
            # Extract key-value pairs
            kv_pairs = self._extract_from_table_structure(table, table_structure)
            # Store by category
            for key, value in kv_pairs:
                category = self._categorize_field(key)
                results[category][key] = value
        # Extract information from free paragraphs
        paragraph_info = self._extract_from_paragraphs(doc.paragraphs)
        for key, value in paragraph_info:
            category = self._categorize_field(key)
            results[category][key] = value

        return dict(results)

    def _analyze_table_structure(self, table) -> List[List[Dict]]:
        """
        Analyze the table and return metadata for every cell.
        """
        structure = []

        for row_idx, row in enumerate(table.rows):
            row_info = []
            for col_idx, cell in enumerate(row.cells):
                cell_text = cell.text.strip()
                # Per-cell attributes
                cell_info = {
                    'text': cell_text,
                    'row': row_idx,
                    'col': col_idx,
                    'rowspan': 1,
                    'colspan': 1,
                    'is_key': self._is_likely_key(cell_text),
                    'is_value': self._is_likely_value(cell_text),
                }
                row_info.append(cell_info)
            structure.append(row_info)

        return structure

    def _extract_from_table_structure(self, table, structure) -> List[Tuple[str, str]]:
        """
        Extract key-value pairs from the analyzed table structure.
        """
        kv_pairs = []
        visited = set()
        for row_idx, row in enumerate(structure):
            for col_idx, cell in enumerate(row):
                print(f"visited is {visited} ")
                print(f'row {row_idx} col {col_idx} all cell is {cell}')
                if (row_idx, col_idx) in visited:
                    print(f'---{row_idx}, {col_idx} ')
                    print(f'cell is {cell}')
                    continue

                if cell['is_key']:
                    # Look up the matching value
                    print(f"cell2 is {cell} row {row_idx} col {col_idx}")
                    value = self._find_value_for_key(table, structure, row_idx, col_idx, visited, kv_pairs)
                    if value:
                        key = self._normalize_key(cell['text'])
                        found = False
                        kv_pairs = [(k, v + "," + value) if k == cell['text'] else (k, v) for k, v in kv_pairs]
                        for i, (k, v) in enumerate(kv_pairs):
                            if k == cell['text']:
                                kv_pairs[i] = (k, value)
                                found = True
                        if not found:
                            kv_pairs.append((key, value))

                else:
                    print("not a key")
        return kv_pairs

    def _find_value_for_key(self, table, structure, key_row, key_col, visited, kv_pairs) -> str:
        """
        Find the value that belongs to a key cell.
        """
        # Try the cell to the right
        if key_col + 1 < len(structure[key_row]):
            value_cell = structure[key_row][key_col + 1]
            current_key_cell = structure[key_row][key_col]
            if value_cell['is_key']:
                return None
            # Special handling for the education rows
            spec_coll = ['全日制教育', '在职教育']
            if current_key_cell['text'].replace('\n', '') in spec_coll:
                if not value_cell['text']:
                    value_cell['text'] = "否"
                else:
                    value_cell['text'] = '是'

            if not value_cell['text']:
                value_cell['text'] = "None"
            if value_cell['text'] and (key_row, key_col + 1) not in visited:
                # Check whether this key already produced a value (possibly a merged cell)
                if not self._is_key_duplicate_merged_cell(structure[key_row][key_col]['text'], kv_pairs):
                    print("right-hand value is not a duplicate")
                    print(f"visited add {key_row} {key_col + 1}")
                    visited.add((key_row, key_col + 1))
                    return value_cell['text']
                else:
                    current_key = structure[key_row][key_col]['text']
                    print(f"duplicate key ------------------------------ key {current_key}")
                    for key, value in kv_pairs:
                        if key == current_key:
                            return value + "," + value_cell['text']

        # Try the cell below
        if key_row + 1 < len(structure):
            value_cell = structure[key_row + 1][key_col]
            if value_cell['text'] and (key_row + 1, key_col) not in visited:
                # Check whether this key already produced a value (possibly a merged cell)
                if not self._is_key_duplicate_merged_cell(structure[key_row][key_col]['text'], kv_pairs):
                    print("value below is not a duplicate")
                    print(f"visited add {key_row + 1} {key_col}")
                    visited.add((key_row + 1, key_col))
                    return value_cell['text']

        # Fall back to scanning for merged cells
        for row_idx in range(len(structure)):
            for col_idx in range(len(structure[row_idx])):
                cell = structure[row_idx][col_idx]
                if (row_idx, col_idx) not in visited and cell['text']:
                    # Only consider cells near the key
                    if abs(row_idx - key_row) <= 2 and abs(col_idx - key_col) <= 2:
                        # Check whether this key already produced a value
                        if not self._is_key_duplicate_merged_cell(structure[key_row][key_col]['text'], kv_pairs):
                            print("merged-cell value is not a duplicate")
                            print(f"visited add {row_idx} {col_idx}")
                            visited.add((row_idx, col_idx))
                            return cell['text']
        return None

    def _is_key_duplicate_merged_cell(self, text, kv_pairs) -> bool:
        """
        Check whether this key text is already present in the collected
        kv_pairs, which happens when a key cell is vertically merged.

        Example: rows 1 and 2 both report '毕业院校' in column 0.
        Pass 1 pairs (row 1, col 0) with (row 1, col 2) as key:value;
        pass 2 reaches (row 2, col 0), finds '毕业院校' already in kv_pairs,
        and does not pair (row 2, col 0) with (row 2, col 1) again.

                     | 硕士学位/研究生学历:中国科学院计算技术研究所计算机技术专业
            毕业院校 |------------------------------------------------------
                     |
                     |------------------------------------------------------
        """
        for k, v in kv_pairs:
            if text == k:
                return True

        return False

    def extract_parentheses_content(self, text):
        # Use a regex to pull out everything inside (half- or full-width) parentheses
        matches = re.findall(r'[((]([^))]*)[))]', text)

        return matches  # a list; there may be several parenthesised groups

    def _is_likely_key(self, text: str) -> bool:
        """Heuristically decide whether a text is a key."""
        if not text or len(text) > 20:
            return False

        # Check for common field words
        key_indicators = ['籍贯', '籍 贯', '政治面貌', '政治\n面貌', '姓名', '性别', '姓 名', '性 别', '出生年月', '民族', '民 族', '单位', '部门', '联系地址', '主要学习经历', '全日制教育', '在职教育',
                          '职务', '职 务', '职\n务', '职称', '职 称', '电话', '地址', '学历', '学位', '现任职务', '职业资格', '奖惩情况(近三年主要奖惩信息)',
                          '专业', '岗位', '经历', '时间', '资格', '现任职单位及部门', '身份证号', '婚姻状况', '健康状况', '应聘岗位', '应聘部门/岗位', '毕业院校系及专业']

        for indicator in key_indicators:
            translation_table = str.maketrans('', '', ' \t\n\r\f\v')
            indicator = indicator.translate(translation_table)
            text = text.translate(translation_table)
            if indicator in text:
                return True

        # Check for a colon (common in Chinese forms)
        if ':' in text or ':' in text:
            key_part = text.split(':')[0].split(':')[0]
            if any(indicator in key_part for indicator in key_indicators):
                return True

        for indicator in key_indicators:
            print("indicator is ===============================", indicator)
            print("text is ===============================", text)
            translation_table = str.maketrans('', '', ' \t\n\r\f\v')
            indicator = indicator.translate(translation_table)
            text = text.translate(translation_table)
            clean_text = self.extract_parentheses_content(text)
            print(text)
            clean_indicator = self.extract_parentheses_content(indicator)
            print(indicator)
            if not clean_text:
                print("special-case match failed")
                return False
            if clean_indicator:
                print("start special-case match =========")
                clean_text = clean_text[0]
                clean_indicator = clean_indicator[0]
                if clean_indicator in clean_text:
                    print(f"special case matched ======= {text}")
                    return True
                else:
                    print("keep matching")
                    continue

        return False

    def _is_likely_value(self, text: str) -> bool:
        """Heuristically decide whether a text is a value."""
        if not text:
            return False

        # A value is usually not a known field name
        if self._is_likely_key(text):
            return False

        # Values often match specific patterns
        if re.match(r'^\d{11}$', text):  # mobile number
            return True
        if re.match(r'^\d{4}年', text):  # date
            return True
        if len(text) > 10:  # long text is probably a value
            return True

        return True

    def _normalize_key(self, key_text: str) -> str:
        """Normalize a key name."""
        # Strip trailing colons and whitespace
        key_text = re.sub(r'[::\s]+$', '', key_text)

        # Map to the canonical key name
        for std_key, variants in self.field_variants.items():
            for variant in variants:
                if variant == key_text or key_text in variant:
                    return std_key

        return key_text

    def _categorize_field(self, key: str) -> str:
        """Categorize a field."""
        categories = {
            '基本信息': ['姓名', '性别', '出生年月', '民族', '政治面貌', '学历学位', '毕业院校系及专业', '全日制教育', '在职教育',
                     '婚姻状况', '健康状况', '籍贯', '身份证号', '联系电话', '婚姻状况', '健康状况', '身份证号', '联系电话(手机)', '毕业院校系及专业', '联系地址', '主要学习经历', '奖惩情况(近三年主要奖惩信息)'],
            '工作信息': ['现任职单位及部门', '现任职务', '职称', '职业资格',
                     '参加工作时间', '职称取得时间', '应聘部门/岗位', '是否接受调剂职级/岗位', '奖惩情况(近三年主要奖惩信息)'],
        }

        for category, fields in categories.items():
            translation_table = str.maketrans('', '', ' \t\n\r\f\v')
            key = key.translate(translation_table)
            if key in fields:
                # print(f"fields is {fields} key is {key} ")
                return category

        return '其他信息'

    def _extract_from_paragraphs(self, paragraphs) -> List[Tuple[str, str]]:
        """Extract information from paragraphs."""
        kv_pairs = []

        for para in paragraphs:
            text = para.text.strip()
            if not text:
                continue

            # Try colon-separated key-value pairs
            if ':' in text or ':' in text:
                separator = ':' if ':' in text else ':'
                parts = text.split(separator, 1)

                if len(parts) == 2:
                    key = parts[0].strip()
                    value = parts[1].strip()

                    if self._is_likely_key(key) and value:
                        normalized_key = self._normalize_key(key)
                        kv_pairs.append((normalized_key, value))

        return kv_pairs


# Quick-use example
def quick_extract(docx_path: str):
    """Extract a file and print the results."""
    extractor = EnhancedDocxExtractor()

    try:
        result = extractor.extract_with_table_structure(docx_path)
        print("\nextraction result (key-value format):")
        print("=" * 60)

        for category, fields in result.items():
            if fields:
                print(f"\n{category}:")
                for key, value in fields.items():
                    print(f"  {key}: {value}")
        return result

    except Exception as e:
        print(f"extraction failed: {e}")


base_map = ['姓名', '性别', '籍贯', '政治面貌', '出生年月', '身份证号', '现居住地', '民族', '学历', '学位', '学历学位', '特长', '联系电话', '联系电话(手机)',
            '婚姻状况', '健康状况', '毕业院校系及专业', '主要学习经历', '联系地址', '入党/团时间', '全日制教育', '在职教育', '奖惩情况(近三年主要奖惩信息)']
work_map = ['参加工作时间', '现任职单位及部门', '职务', '现任职务', '职称', '奖惩', '工作经历', '主要工作经历', '职称取得时间', '职业资格', '应聘部门/岗位']
other_map = ['工作经历', '主要工作经历', '职称取得时间', '职业资格', '应聘部门/岗位', '是否接受调剂职级/岗位']


def fetch_info(data):
    map_word = base_map + work_map + other_map
    print("data is {0}".format(data))
    print("map_word is {0}".format(map_word))
    final_res = {}
    for key, value in data.items():
        translation_table = str.maketrans('', '', ' \t\n\r\f\v')
        clean_key = key.translate(translation_table)
        print(f"key is {clean_key} ")
        if clean_key in map_word:
            # clean_value = value.translate(translation_table)
            final_res[clean_key] = value

    return final_res


def extra_resume(file_path):
    result = quick_extract(file_path)
    print(result)
    if not result:
        return {}
    base_data = result.get('基本信息', {})
    work_data = result.get('工作信息', {})
    other_data = result.get('其他信息', {})
    data = {}
    data.update(base_data)
    data.update(work_data)
    data.update(other_data)
    res = fetch_info(data)
    return res


# if __name__ == "__main__":
#     # Usage
#     docx_file = "../1.报名登记表.docx"  # replace with your file
#     print(extra_resume(docx_file))
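A minimal driver for the extractor, mirroring the commented-out `__main__` block above (the .docx path is a placeholder):

```python
# Hypothetical driver; the path should point at a table-based registration form.
import json

from service.parse_resume2_doc import extra_resume

info = extra_resume("报名登记表.docx")
print(json.dumps(info, ensure_ascii=False, indent=2))
```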
service/template.docx (Normal file, BIN). Binary file not shown.