Commit 1e617112 by 管志勇

将内容分段,支持重叠长度控制,每段以alarm_name开头并添加换行,总长度不超过RERANK_MAX_LENGTH

parent 22e027a1
......@@ -25,9 +25,9 @@ def log_message(message):
# Configuration parameters (adjust these to match the actual environment).
# NOTE(review): each name below is assigned twice. This looks like the old and
# new sides of the commit diff shown together; read as Python, the later
# assignment of each name is the one that takes effect.
# NOTE(review): API_KEY is a credential hard-coded in source. Consider loading
# it from the environment or a secrets store, and rotating the committed keys.
DIFY_API_BASE_URL = 'http://192.168.141.145/v1/datasets'
DATASET_ID = 'ab6b26f3-87ad-449d-99eb-55e9bc4f15b1'
API_KEY = 'dataset-jPCfnGYrO3Jyyj9isGMjiyBe'
DIFY_API_BASE_URL = 'http://122.112.204.51/v1/datasets'
DATASET_ID = 'a6f4574f-9fac-4b08-ad2f-673f8533e33d'
API_KEY = 'dataset-YmAQDZbuEq5c4Q2KhSV7afyY'
DB_CONFIG = {
'host': 'localhost',
'user': 'root',
......@@ -36,7 +36,8 @@ DB_CONFIG = {
'charset': 'utf8mb4',
'cursorclass': pymysql.cursors.DictCursor,
}
# NOTE(review): SEGMENT_LENGTH appears to be the pre-change constant that this
# commit replaces with RERANK_MAX_LENGTH / OVERLAP_LENGTH; confirm it is unused.
SEGMENT_LENGTH = 1000 # segment size
RERANK_MAX_LENGTH = 510 # maximum segment length (including alarm_name)
OVERLAP_LENGTH = 50 # overlap length between consecutive segments
# Sync-state file path, resolved relative to the directory containing this script.
SYNC_STATE_FILE = os.path.join(os.path.dirname(__file__), 'sync_state_common.txt')
DOCUMENT_READY_TIMEOUT = 10 # timeout waiting for a created document to become ready (seconds)
POLLING_INTERVAL = 1 # polling interval (seconds)
......@@ -90,11 +91,11 @@ def fetch_records(sync_type, last_sync):
def clean_solution_html(solution_html):
    """Strip HTML markup from ``solution_html`` while keeping ``<img>`` tags.

    Every tag whose name is not ``img`` (including closing tags) is removed,
    and ``&nbsp;`` entities are dropped. Text between tags is left untouched.

    Args:
        solution_html: Raw HTML string; ``None`` or an empty string is accepted.

    Returns:
        The cleaned string, or ``""`` when the input is empty or ``None``.
    """
    # Guard against missing values so re.sub never receives None.
    if not solution_html:
        return ""
    # The negative lookahead keeps "<img ...>" and removes every other tag.
    return re.sub(r'&nbsp;|<(?!img\b)[^>]*>', '', solution_html)
def split_content(alarm_name, content, max_length=None, overlap=None):
    """Split ``content`` into overlapping segments prefixed with ``alarm_name``.

    Each segment has the form ``f"{alarm_name}\\n{chunk}"`` and its total
    length (header included) never exceeds ``max_length``. Consecutive chunks
    share ``overlap`` characters.

    Args:
        alarm_name: Header text placed at the start of every segment; ``None``
            is treated as an empty string.
        content: Text to split; ``None`` is treated as an empty string.
        max_length: Maximum length of one segment including the header and its
            newline. Defaults to the module-level ``RERANK_MAX_LENGTH``.
        overlap: Number of characters shared by consecutive chunks. Defaults
            to the module-level ``OVERLAP_LENGTH``.

    Returns:
        A list of segment strings. Empty content yields an empty list. If the
        header alone fills or exceeds ``max_length``, a single segment holding
        the truncated alarm name is returned because no content can fit.
    """
    if max_length is None:
        max_length = RERANK_MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP_LENGTH

    alarm_name = alarm_name or ""
    content = content or ""

    header = f"{alarm_name}\n"
    # Characters left for content once the header (name + newline) is counted.
    window = max_length - len(header)
    if window <= 0:
        # The header leaves no room for content: emit the truncated name once
        # rather than one header-only segment per content character.
        return [alarm_name[:max_length]]

    # Cap the overlap at half the window so every step makes real progress;
    # an overlap >= window would otherwise degrade to 1-character steps.
    overlap = max(0, min(overlap, window // 2))
    step = window - overlap

    segments = []
    total = len(content)
    start = 0
    while start < total:
        end = start + window
        segments.append(header + content[start:end])
        if end >= total:
            # This chunk already reaches the end of the content.
            break
        start += step

    # Drop segments that contain nothing but whitespace.
    return [segment for segment in segments if segment.strip()]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment