Compare commits
14 Commits: 351df35642 ... ec0995d08a

| SHA1 |
|---|
| ec0995d08a |
| a124651a7e |
| a7ddfcde2a |
| 43af924920 |
| e9d225939a |
| ff1c0e890c |
| 9fd3376557 |
| 8f35513063 |
| f32aa61c0f |
| 4e8995eaed |
| f1063146d2 |
| 992bab2887 |
| eb32528f7e |
| dcc6db2363 |
9  Dockerfile  Normal file
@@ -0,0 +1,9 @@
FROM docker.m.daocloud.io/python:3.12-slim
WORKDIR /app
COPY . /app
ENV TZ=Asia/Shanghai \
    LANG=C.UTF-8
RUN rm -rf logs .git .idea .venv && apt-get update && apt-get install -y vim curl sqlite3 && pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/
RUN mkdir -p /app/uploads
EXPOSE 3006
CMD ["python", "main.py"]
0  db/__init__.py  Normal file
48  db/sql_db.py  Normal file
@@ -0,0 +1,48 @@
from sqlalchemy import Column, DateTime, Integer, BigInteger, String, create_engine, Boolean, Text
from sqlalchemy.orm import declarative_base, sessionmaker
from decouple import config

# declare the declarative base class
Base = declarative_base()

DB_PATH = config('DB_PATH', default='D://PycharmProject//yj_resume//main.sqlite3')


class DBTASK(Base):
    __tablename__ = 'db_task'
    id = Column(String(100), primary_key=True)
    name = Column(String(100), nullable=False)
    create_time = Column(DateTime, nullable=False)
    # 0 = pending, 1 = success, 2 = failed
    status = Column(Integer, nullable=False, default=0)
    success_num = Column(Integer, nullable=False, default=0)
    total_num = Column(Integer, nullable=False, default=0)
    fail_num = Column(Integer, nullable=False, default=0)
    message = Column(Text, nullable=True)


class DBRESUME(Base):
    __tablename__ = 'db_resume'
    id = Column(String(100), primary_key=True)
    # each task maps to one upload directory ID
    task_id = Column(String(100), nullable=False)
    # 0 = pending, 1 = success, 2 = failed
    status = Column(Integer, nullable=False, default=0)
    file_name = Column(String(100), nullable=True)
    # extracted data, stored as JSON
    data_info = Column(Text, nullable=True)
    # error message, if any
    message = Column(Text, nullable=True)


class SqliteSqlalchemy(object):
    def __init__(self):
        # create the SQLite engine
        engine = create_engine(f'sqlite:///{DB_PATH}', echo=True)
        # create the tables if they do not exist yet
        Base.metadata.create_all(engine, checkfirst=True)
        # open a session bound to this engine
        self.session = sessionmaker(bind=engine)()
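For orientation, a minimal sketch of how this session object is used by the service layer below (the task ID and name here are made-up example values, not part of the diff):

```python
from datetime import datetime
from db.sql_db import DBTASK, SqliteSqlalchemy

# each SqliteSqlalchemy() call builds a fresh engine and session
session = SqliteSqlalchemy().session
try:
    # "demo-task-1" and "demo" are illustrative values only
    session.add(DBTASK(id="demo-task-1", name="demo", create_time=datetime.now(),
                       status=0, success_num=0, total_num=0, fail_num=0))
    session.commit()
    pending = session.query(DBTASK).filter_by(status=0).all()
    print([t.id for t in pending])
finally:
    session.close()
```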
62  logging_config.py  Normal file
@@ -0,0 +1,62 @@
# logging_config.py
import logging
import logging.config
from pathlib import Path

# make sure the logs directory exists
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)

LOGGING_CONFIG = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "default": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s",
        },
        "detailed": {
            "format": "%(asctime)s - %(name)s - %(levelname)s - %(funcName)s - %(message)s",
        }
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "level": "INFO",
            "formatter": "default",
            "stream": "ext://sys.stdout"
        },
        "file": {
            "class": "logging.handlers.RotatingFileHandler",  # rotates automatically
            "level": "INFO",
            "formatter": "detailed",
            "filename": "logs/resume.log",
            "maxBytes": 10485760,  # 10MB
            "backupCount": 5,  # keep 5 backups
            "encoding": "utf8"
        },
    },
    "root": {
        "level": "INFO",
        "handlers": ["console", "file"]
    },
    "loggers": {
        "uvicorn": {
            "level": "INFO",
            "handlers": ["console", "file"],
            "propagate": False
        },
        "uvicorn.error": {
            "level": "INFO",
            "handlers": ["console", "file"],
            "propagate": False
        },
        "uvicorn.access": {
            "level": "WARNING",  # warnings and above only, to avoid flooding the log
            "handlers": ["file"],  # write to file only
            "propagate": False
        }
    }
}

# apply the configuration at import time
logging.config.dictConfig(LOGGING_CONFIG)
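Because dictConfig runs at import time, consumers only need to import this module before creating loggers; a minimal sketch:

```python
import logging
import logging_config  # noqa: F401  (imported for its dictConfig side effect)

logger = logging.getLogger(__name__)
logger.info("goes to stdout and logs/resume.log")
logger.debug("dropped: both handlers are set to INFO")
```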
45  main.py
@@ -1,5 +1,17 @@
-from fastapi import FastAPI
-import uvicorn
+import uvicorn
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from typing import List
+from service.file_service import check_and_create_directory, upload_and_save_file, fetch_files
+from service import excel_service
+from service.db_service import get_task_list
+from fastapi.responses import FileResponse
+import threading
+from logging_config import LOGGING_CONFIG
+import logging
+
+logger = logging.getLogger(__name__)
+
 
 app = FastAPI()
 
@@ -8,7 +20,36 @@ def read_root():
     return {"Hello": "World"}
 
 
+# upload files and parse them; parsing runs asynchronously
+@app.post("/upload_files_and_parse")
+async def create_upload_files(files: List[UploadFile] = File(...)):
+    dir_id = check_and_create_directory(files)
+    if not dir_id:
+        return {"result": False, "code": 500, "message": "create directory failed"}
+    flag, message = await upload_and_save_file(dir_id, files)
+    logger.info(f"flag is {flag}")
+    if flag:
+        flag, message = await fetch_files(dir_id)
+    return {"result": flag, "message": message, "task_id": dir_id}
+
+
+@app.get("/export_task_data_to_excel")
+def export_task_data_to_excel(task_id: str):
+    path_xx = excel_service.export_task_data_to_excel(task_id)
+    if not path_xx:
+        raise HTTPException(status_code=404, detail="file not found")
+    return FileResponse(
+        path=path_xx,
+        media_type="application/octet-stream",  # generic binary stream
+        filename=f"{task_id}.xlsx"  # file name the browser uses for the download
+    )
+
+
+@app.get("/parse_task_list")
+def parse_task_list():
+    data = get_task_list()
+    return {"data": data, "code": 200}
+
+
 if __name__ == '__main__':
-    uvicorn.run(app, host="127.0.0.1", port=3006)
+    uvicorn.run(app, host="127.0.0.1", port=3006)
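A client-side sketch of the round trip these endpoints support (assumes the service is running on 127.0.0.1:3006 and that a sample.docx exists locally; this is not part of the diff):

```python
import requests

# upload one .docx; the response carries the task_id used by the other endpoints
with open("sample.docx", "rb") as f:
    resp = requests.post(
        "http://127.0.0.1:3006/upload_files_and_parse",
        files=[("files", ("sample.docx", f))],
    )
task_id = resp.json()["task_id"]

# fetch the Excel export once parsing succeeded
xlsx = requests.get("http://127.0.0.1:3006/export_task_data_to_excel",
                    params={"task_id": task_id})
if xlsx.status_code == 200:
    with open(f"{task_id}.xlsx", "wb") as out:
        out.write(xlsx.content)
```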
requirements.txt
@@ -1,3 +1,18 @@
-python-docx== 0.8.11
+python-docx
 fastapi
 uvicorn
+docxtpl
+SQLAlchemy
+python-decouple
+python-multipart
+pandas
+openpyxl
+
+python-multipart
+PyMuPDF>=1.23.0
+paddlepaddle>=2.5.0
+paddleocr>=2.7.0.3
+opencv-python>=4.8.0
+numpy>=1.24.0
+pdf2image>=1.16.3
+Pillow>=10.0.0
0  service/__init__.py  Normal file
18  service/db_service.py  Normal file
@@ -0,0 +1,18 @@
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy


def get_task_list():
    task_list = SqliteSqlalchemy().session.query(DBTASK).order_by(DBTASK.create_time.desc()).all()
    result = []
    for task in task_list:
        result.append({
            "id": task.id,
            "name": task.name,
            "success_num": task.success_num,
            "fail_num": task.fail_num,
            "status": task.status,
            "total_num": task.total_num,
            "message": task.message,
            "create_time": task.create_time.strftime("%Y-%m-%d %H:%M:%S") if task.create_time else None,
        })
    return result
35  service/excel_service.py  Normal file
@@ -0,0 +1,35 @@
from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy
import json
import pandas as pd
import pathlib
from decouple import config

# BASE_PATH = config('BASE_PATH', default='E://pyptoject//yj_resume//')
BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//')


# export task data to an Excel file
def export_to_excel(task_id):
    # fetch every successfully parsed resume for this task
    list_data = SqliteSqlalchemy().session.query(DBRESUME).filter_by(task_id=task_id, status=1).all()
    pd_data = []
    for data in list_data:
        pd_data.append(json.loads(data.data_info))
    data_frame = pd.DataFrame(pd_data)
    # write to Excel
    pathxx = pathlib.Path(BASE_PATH).joinpath(task_id)
    pathxx = pathxx.joinpath(f"{task_id}.xlsx")
    data_frame.to_excel(pathxx, index=False)


def export_task_data_to_excel(task_id):
    pathxx = pathlib.Path(BASE_PATH).joinpath(task_id)
    pathxx = pathxx.joinpath(f"{task_id}.xlsx")
    if pathxx.exists():
        return pathxx
    session = SqliteSqlalchemy().session
    task = session.query(DBTASK).filter_by(id=task_id).first()
    if not task or task.status == 0 or task.status == 2:
        return None
    export_to_excel(task_id)
    return pathxx
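What export_to_excel produces, in isolation: each data_info JSON blob becomes one row, and its keys become columns. A self-contained sketch with made-up values:

```python
import json
import pandas as pd

# stand-ins for DBRESUME.data_info rows (illustrative values only)
rows = ['{"姓名": "张三", "学历": "本科"}', '{"姓名": "李四", "学历": "硕士"}']
df = pd.DataFrame([json.loads(r) for r in rows])
df.to_excel("demo.xlsx", index=False)  # needs openpyxl installed
```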
139  service/file_service.py  Normal file
@@ -0,0 +1,139 @@
import json
import os
import pathlib
import uuid
import logging
from datetime import datetime
from typing import List, Tuple

from decouple import config
from fastapi import UploadFile

from db.sql_db import DBTASK, DBRESUME, SqliteSqlalchemy
from service.parse_resume2_doc import extra_resume

logger = logging.getLogger(__name__)
BASE_PATH = config('BASE_PATH', default='D://PycharmProject//yj_resume//')


def check_and_create_directory(files):
    logger.info("check_and_create_directory in service")
    # create a task record first
    if not files or len(files) == 0:
        return None
    id = str(uuid.uuid4())
    current_time = datetime.now()
    # format the timestamp for the task name
    formatted_time = current_time.strftime("%Y-%m-%d-%H-%M-%S")
    task = DBTASK(id=id, create_time=datetime.now(), status=0, success_num=0, total_num=len(files),
                  fail_num=0, name=f"parse task ({formatted_time})")

    session = SqliteSqlalchemy().session
    try:
        session.add(task)
        session.commit()
    except Exception as e:
        logger.error(f"Failed to save DBTASK info error {e}")
        session.rollback()
        return None
    finally:
        session.close()
    return id


async def upload_and_save_file(dir_id, files: List[UploadFile]) -> Tuple[bool, str]:
    logger.info(f"upload_and_save_file in service dir_id {dir_id}")
    pathxx = pathlib.Path(BASE_PATH).joinpath(dir_id)
    pathxx.mkdir(parents=True, exist_ok=True)
    data = []
    for file in files:
        name, fix = os.path.splitext(file.filename)
        id = str(uuid.uuid4())
        if fix not in ['.doc', '.docx']:
            continue
        with open(pathxx.joinpath(id + fix), 'wb') as f:
            file_content = await file.read()
            f.write(file_content)

        data.append(DBRESUME(id=id, task_id=dir_id, status=0, file_name=id + fix))
    session = SqliteSqlalchemy().session
    try:
        session.bulk_save_objects(data)
        session.commit()
    except Exception as e:
        logger.error(f"Failed to save DBRESUME error {e}")
        session.rollback()
        return False, f"Failed to save DBRESUME error {e}"
    finally:
        session.close()
    return True, "success"


async def fetch_files(dir_id) -> Tuple[bool, str]:
    logger.info(f"start fetching files task {dir_id} in service")
    if not os.path.exists(BASE_PATH):
        logger.info(f"directory {BASE_PATH} does not exist")
        return False, f"directory {BASE_PATH} does not exist"
    file_extensions = ['.docx', '.doc']
    files_list = []
    dir_path = pathlib.Path(BASE_PATH).joinpath(dir_id)
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            _, ext = os.path.splitext(file)
            if file_extensions and ext not in file_extensions:
                logger.error(f"file {file} has an unexpected extension")
                continue
            file_path = os.path.join(root, file)
            if os.path.isfile(file_path):
                files_list.append(file_path)
            else:
                logger.error(f"path {file_path} is not a file")
    update_success_mapping = []
    update_fail_mapping = []
    for file in files_list:
        logger.info(f"file is {file} {os.path.basename(file)}")
        file_name = os.path.basename(file)
        id = os.path.splitext(file_name)[0]
        result = extra_resume(file)
        # check emptiness before serializing: json.dumps({}) would be the truthy string "{}"
        if not result:
            logger.warning(f"file {file_name} produced no extracted data")
            # mark as failed (2) so the record is not left pending
            update_fail_mapping.append({'id': id, 'status': 2,
                                        'message': f"task {dir_id} => file {file_name} produced no extracted data"})
            continue
        result = json.dumps(result, ensure_ascii=False)
        logger.info(f"file content is {result}")
        update_success_mapping.append({'id': id, 'status': 1, 'data_info': result})
    session = SqliteSqlalchemy().session
    logger.info(f"update success mapping => {update_success_mapping}")
    logger.info(f"update fail mapping => {update_fail_mapping}")
    success_num = len(update_success_mapping)
    fail_num = len(update_fail_mapping)
    try:
        update_data = update_success_mapping + update_fail_mapping
        session.bulk_update_mappings(DBRESUME, update_data)

        if update_fail_mapping:
            session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 2, 'success_num': success_num,
                                                   'fail_num': fail_num, 'message': f'fail => {update_fail_mapping}'}])
        else:
            session.bulk_update_mappings(DBTASK, [{'id': dir_id, 'status': 1,
                                                   'success_num': success_num, 'fail_num': fail_num}])
        session.commit()
    except Exception as e:
        logger.error(f"update failed => task {dir_id} error {e}")
        session.rollback()
        return False, f"Failed to update DBRESUME error {e}"
    finally:
        session.close()

    return True, 'success'
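Because fetch_files is a coroutine, it can be smoke-tested outside FastAPI with asyncio; a sketch (the task ID is a hypothetical placeholder, and its directory must already hold .docx files saved by upload_and_save_file):

```python
import asyncio
from service.file_service import fetch_files

# "some-task-uuid" stands in for an ID returned by check_and_create_directory
flag, message = asyncio.run(fetch_files("some-task-uuid"))
print(flag, message)
```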
35  service/format_template_resume.py  Normal file
@@ -0,0 +1,35 @@
from docxtpl import DocxTemplate
from pathlib import Path

context = {
    'name': '张三',
    'sex': '男',
    'nation': '汉族',
    'brith': '1990-01-01',
    'address': '北京市海淀区西二旗',
    'education': '本科',
    'degree': '学士',
    # native place
    'origin': '山东',
    'politics': '党员',
    # department
    'department': '数信部',
    'position': '助理开发工程师',
    'phone': '13812345678',
    'title': '后端开发工程师',
    'start_work_time': '2018-01-01',
    # ID card number
    'id_number': '500221199001010010101',
    # honors; intended to be produced by the LLM
    'honor': '一等奖',
    # work history text
    'work_text': '''
2023.12-2024.10:负责《边缘计算+5G自组网的水电物联网系统建设与研究》项目异常检测算法和项目实施:利用5G自组网技术、自建边缘计算单元等,实际实现在线异常检测、时间序列趋势分析、模型轻量化等功能,缓解通信带宽压力;在观音岩、彭水、渝能等场站实施应用。完成项目科技成果凝练、项目报奖等工作。本项目工作获得第六届全国设备管理与技术创新成果一等奖、中电联职工创新成果二等奖。
2024.04-2025.至今:广西河池源网荷储一体化项目/大唐西藏玉曲河扎拉电厂可行性研究报告&方案编写、AI支持中心方案策划
'''
}
file_path = Path.cwd().joinpath('template.docx')
template = DocxTemplate(file_path)
template.render(context)
template.save('E://resu//output.docx')
384  service/parse_resume2_doc.py  Normal file
@@ -0,0 +1,384 @@
import re
import json
from pathlib import Path

from docx import Document
from typing import Dict, List, Any, Tuple
from collections import defaultdict


class EnhancedDocxExtractor:
    def __init__(self):
        # known spelling/spacing variants for each field label
        self.field_variants = {
            '姓名': ['姓名', '姓 名', '姓 名', '姓名:', '姓 名:', '姓 名'],
            '性别': ['性别', '性 别', '性 别', '性别:', '性 别:', '性 别'],
            '出生年月': ['出生年月', '出生年月:', '出生日期', '出生日期:'],
            '民族': ['民族', '民族:', '民 族'],
            '政治面貌': ['政治面貌', '政治面貌:', '政治面貌:'],
            '现任职单位及部门': ['现任职单位及部门', '单位及部门', '工作单位', '现任职单位'],
            '联系电话': ['联系电话', '电话', '手机', '联系电话:', '手机号'],
            '联系地址': ['联系地址', '地址', '联系地址:', '家庭地址'],
            '学历学位': ['学历', '学历:', '学 历', '学历\n学位', '学位', '学位:', '学 位'],
            '毕业院校': ['毕业院校', '毕业学校', '毕业院校:', '毕业院校系及专业'],
            '专业': ['专业', '专业:', '系及专业', '所学专业'],
        }

    def extract_with_table_structure(self, docx_path: str) -> Dict[str, Any]:
        """
        Extract the table-structured data from a .docx file.
        """
        doc = Document(docx_path)
        results = defaultdict(dict)
        # process every table in the document
        for table_idx, table in enumerate(doc.tables):
            print(f"\nprocessing table {table_idx + 1} ({len(table.rows)} rows x {len(table.columns)} columns)")

            # analyze the table layout
            table_structure = self._analyze_table_structure(table)
            # extract key-value pairs
            kv_pairs = self._extract_from_table_structure(table, table_structure)
            # store them by category
            for key, value in kv_pairs:
                category = self._categorize_field(key)
                results[category][key] = value
        # also extract information from plain paragraphs
        paragraph_info = self._extract_from_paragraphs(doc.paragraphs)
        for key, value in paragraph_info:
            category = self._categorize_field(key)
            results[category][key] = value

        return dict(results)

    def _analyze_table_structure(self, table) -> List[List[Dict]]:
        """
        Analyze the table and return metadata for every cell.
        """
        structure = []

        for row_idx, row in enumerate(table.rows):
            row_info = []
            for col_idx, cell in enumerate(row.cells):
                cell_text = cell.text.strip()
                # record the cell's properties
                cell_info = {
                    'text': cell_text,
                    'row': row_idx,
                    'col': col_idx,
                    'rowspan': 1,
                    'colspan': 1,
                    'is_key': self._is_likely_key(cell_text),
                    'is_value': self._is_likely_value(cell_text),
                }
                row_info.append(cell_info)
            structure.append(row_info)

        return structure

    def _extract_from_table_structure(self, table, structure) -> List[Tuple[str, str]]:
        """
        Extract key-value pairs from the analyzed table structure.
        """
        kv_pairs = []
        visited = set()
        for row_idx, row in enumerate(structure):
            for col_idx, cell in enumerate(row):
                if (row_idx, col_idx) in visited:
                    continue

                if cell['is_key']:
                    # locate the value that belongs to this key
                    value = self._find_value_for_key(table, structure, row_idx, col_idx, visited, kv_pairs)
                    if value:
                        key = self._normalize_key(cell['text'])
                        found = False
                        # a key repeated by a merged cell keeps its (merged) value
                        for i, (k, v) in enumerate(kv_pairs):
                            if k == cell['text']:
                                kv_pairs[i] = (k, value)
                                found = True
                        if not found:
                            kv_pairs.append((key, value))
        return kv_pairs
    def _find_value_for_key(self, table, structure, key_row, key_col, visited, kv_pairs) -> str:
        """
        Locate the value that belongs to a key cell.
        """
        # try the cell to the right
        if key_col + 1 < len(structure[key_row]):
            value_cell = structure[key_row][key_col + 1]
            current_key_cell = structure[key_row][key_col]
            if value_cell['is_key']:
                return None
            # special handling for the education columns
            spec_coll = ['全日制教育', '在职教育']
            if current_key_cell['text'].replace('\n', '') in spec_coll:
                # these cells act as checkboxes: non-empty means checked
                if not value_cell['text']:
                    value_cell['text'] = 'False'
                else:
                    value_cell['text'] = 'True'

            if value_cell['text'] and (key_row, key_col + 1) not in visited:
                # check whether this key was already extracted (merged cell)
                if not self._is_key_duplicate_merged_cell(structure[key_row][key_col]['text'], kv_pairs):
                    visited.add((key_row, key_col + 1))
                    return value_cell['text']
                else:
                    # duplicate key from a merged cell: append to the existing value
                    current_key = structure[key_row][key_col]['text']
                    for key, value in kv_pairs:
                        if key == current_key:
                            return value + "," + value_cell['text']

        # try the cell below
        if key_row + 1 < len(structure):
            value_cell = structure[key_row + 1][key_col]
            if value_cell['text'] and (key_row + 1, key_col) not in visited:
                # check whether this key was already extracted (merged cell)
                if not self._is_key_duplicate_merged_cell(structure[key_row][key_col]['text'], kv_pairs):
                    visited.add((key_row + 1, key_col))
                    return value_cell['text']

        # fall back to nearby cells (merged-cell layouts)
        for row_idx in range(len(structure)):
            for col_idx in range(len(structure[row_idx])):
                cell = structure[row_idx][col_idx]
                if (row_idx, col_idx) not in visited and cell['text']:
                    # only consider cells close to the key
                    if abs(row_idx - key_row) <= 2 and abs(col_idx - key_col) <= 2:
                        if not self._is_key_duplicate_merged_cell(structure[key_row][key_col]['text'], kv_pairs):
                            visited.add((row_idx, col_idx))
                            return cell['text']
        return None

    def _is_key_duplicate_merged_cell(self, text, kv_pairs) -> bool:
        """
        Check whether this key text has already been collected in kv_pairs.
        This happens with vertically merged cells: e.g. rows 1 and 2 both
        report '毕业院校' in column 0. The first pass pairs row 1 with its
        value cell; on the second pass the key is already present, so the
        cell in row 2 is not paired again.
        """
        for k, v in kv_pairs:
            if text == k:
                return True

        return False
    def extract_parentheses_content(self, text):
        # extract everything inside (possibly full-width) parentheses
        matches = re.findall(r'[((]([^))]*)[))]', text)

        return matches  # a list; there may be several parenthesized groups

    def _is_likely_key(self, text: str) -> bool:
        """Decide whether a piece of text looks like a field label."""
        if not text or len(text) > 20:
            return False

        # check against common field words
        key_indicators = ['籍贯', '籍 贯', '政治面貌', '政治\n面貌', '姓名', '性别', '姓 名', '性 别', '出生年月', '民族', '民 族', '单位', '部门', '联系地址', '主要学习经历', '全日制教育', '在职教育',
                          '职务', '职 务', '职\n务', '职称', '职 称', '电话', '地址', '学历', '学位', '现任职务', '职业资格', '奖惩情况(近三年主要奖惩信息)',
                          '专业', '岗位', '经历', '时间', '资格', '现任职单位及部门', '身份证号', '婚姻状况', '健康状况', '应聘岗位', '应聘部门/岗位', '毕业院校系及专业']

        # strip all whitespace before comparing
        translation_table = str.maketrans('', '', ' \t\n\r\f\v')
        text = text.translate(translation_table)
        for indicator in key_indicators:
            indicator = indicator.translate(translation_table)
            if indicator in text:
                return True

        # check for a colon (common in Chinese forms)
        if ':' in text or ':' in text:
            key_part = text.split(':')[0].split(':')[0]
            if any(indicator in key_part for indicator in key_indicators):
                return True

        # special case: compare only the parenthesized parts,
        # e.g. '奖惩情况(近三年主要奖惩信息)'
        clean_text = self.extract_parentheses_content(text)
        if not clean_text:
            return False
        for indicator in key_indicators:
            indicator = indicator.translate(translation_table)
            clean_indicator = self.extract_parentheses_content(indicator)
            if clean_indicator:
                if clean_indicator[0] in clean_text[0]:
                    return True

        return False
    def _is_likely_value(self, text: str) -> bool:
        """Decide whether a piece of text looks like a value."""
        if not text:
            return False

        # a value is usually not a field label
        if self._is_likely_key(text):
            return False

        # everything else (phone numbers, dates, longer free text) is a value
        return True

    def _normalize_key(self, key_text: str) -> str:
        """Normalize a key to its canonical field name."""
        # strip trailing colons and whitespace
        key_text = re.sub(r'[::\s]+$', '', key_text)

        # map to the canonical name via the known variants
        for std_key, variants in self.field_variants.items():
            for variant in variants:
                if variant == key_text or key_text in variant:
                    return std_key

        return key_text

    def _categorize_field(self, key: str) -> str:
        """Assign a field to a category."""
        categories = {
            '基本信息': ['姓名', '性别', '出生年月', '民族', '政治面貌', '学历学位', '毕业院校系及专业', '全日制教育', '在职教育',
                     '婚姻状况', '健康状况', '籍贯', '身份证号', '联系电话', '联系电话(手机)', '联系地址', '主要学习经历', '奖惩情况(近三年主要奖惩信息)'],
            '工作信息': ['现任职单位及部门', '现任职务', '职称', '职业资格',
                     '参加工作时间', '职称取得时间', '应聘部门/岗位', '是否接受调剂职级/岗位', '奖惩情况(近三年主要奖惩信息)'],
        }

        # strip all whitespace from the key before matching
        translation_table = str.maketrans('', '', ' \t\n\r\f\v')
        key = key.translate(translation_table)
        for category, fields in categories.items():
            if key in fields:
                return category

        return '其他信息'

    def _extract_from_paragraphs(self, paragraphs) -> List[Tuple[str, str]]:
        """Extract key-value pairs from plain paragraphs."""
        kv_pairs = []

        for para in paragraphs:
            text = para.text.strip()
            if not text:
                continue

            # try to split colon-separated key-value pairs
            if ':' in text or ':' in text:
                separator = ':' if ':' in text else ':'
                parts = text.split(separator, 1)

                if len(parts) == 2:
                    key = parts[0].strip()
                    value = parts[1].strip()

                    if self._is_likely_key(key) and value:
                        normalized_key = self._normalize_key(key)
                        kv_pairs.append((normalized_key, value))

        return kv_pairs


# quick usage example
def quick_extract(docx_path: str):
    """Run the extractor and print the result."""
    extractor = EnhancedDocxExtractor()

    try:
        result = extractor.extract_with_table_structure(docx_path)
        print("\nextracted key-value pairs:")
        print("=" * 60)

        for category, fields in result.items():
            if fields:
                print(f"\n{category}:")
                for key, value in fields.items():
                    print(f"  {key}: {value}")
        return result

    except Exception as e:
        print(f"extraction failed: {e}")
        return None


base_map = ['姓名', '性别', '籍贯', '政治面貌', '出生年月', '身份证号', '现居住地', '民族', '学历', '学位', '学历学位', '特长', '联系电话', '联系电话(手机)',
            '婚姻状况', '健康状况', '毕业院校系及专业', '主要学习经历', '联系地址', '入党/团时间', '全日制教育', '在职教育', '奖惩情况(近三年主要奖惩信息)']
work_map = ['参加工作时间', '现任职单位及部门', '职务', '现任职务', '职称', '奖惩', '工作经历', '主要工作经历', '职称取得时间', '职业资格', '应聘部门/岗位']
other_map = ['工作经历', '主要工作经历', '职称取得时间', '职业资格', '应聘部门/岗位', '是否接受调剂职级/岗位']


def fetch_info(data):
    # keep only whitelisted fields
    map_word = base_map + work_map + other_map
    final_res = {}
    translation_table = str.maketrans('', '', ' \t\n\r\f\v')
    for key, value in data.items():
        clean_key = key.translate(translation_table)
        if clean_key in map_word:
            final_res[clean_key] = value

    return final_res


def extra_resume(file_path):
    result = quick_extract(file_path)
    if not result:
        return {}
    base_data = result.get('基本信息', {})
    work_data = result.get('工作信息', {})
    other_data = result.get('其他信息', {})
    data = {}
    data.update(base_data)
    data.update(work_data)
    data.update(other_data)
    res = fetch_info(data)
    return res


# if __name__ == "__main__":
#     # usage
#     docx_file = "../1.报名登记表.docx"  # replace with your file
#     print(extra_resume(docx_file))
BIN  service/template.docx  Normal file
Binary file not shown.