Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
N
nbcb-dify-knowledge
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
nb-bank
nbcb-dify-knowledge
Commits
1e617112
Commit
1e617112
authored
Jul 16, 2025
by
管志勇
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
将内容分段,支持重叠长度控制,每段以alarm_name开头并添加换行,总长度不超过RERANK_MAX_LENGTH
parent
22e027a1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
14 deletions
+37
-14
sync-knowledge-to-dify-common.py
sync-knowledge-to-dify-common.py
+37
-14
No files found.
sync-knowledge-to-dify-common.py
View file @
1e617112
...
...
@@ -25,9 +25,9 @@ def log_message(message):
# 配置参数(需根据实际环境修改)
DIFY_API_BASE_URL
=
'http://1
92.168.141.145
/v1/datasets'
DATASET_ID
=
'a
b6b26f3-87ad-449d-99eb-55e9bc4f15b1
'
API_KEY
=
'dataset-
jPCfnGYrO3Jyyj9isGMjiyBe
'
DIFY_API_BASE_URL
=
'http://1
22.112.204.51
/v1/datasets'
DATASET_ID
=
'a
6f4574f-9fac-4b08-ad2f-673f8533e33d
'
API_KEY
=
'dataset-
YmAQDZbuEq5c4Q2KhSV7afyY
'
DB_CONFIG
=
{
'host'
:
'localhost'
,
'user'
:
'root'
,
...
...
@@ -36,7 +36,8 @@ DB_CONFIG = {
'charset'
:
'utf8mb4'
,
'cursorclass'
:
pymysql
.
cursors
.
DictCursor
,
}
SEGMENT_LENGTH
=
1000
# 分段大小
RERANK_MAX_LENGTH
=
510
# 分段最大长度(包括alarm_name)
OVERLAP_LENGTH
=
50
# 分段重叠长度
SYNC_STATE_FILE
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'sync_state_common.txt'
)
DOCUMENT_READY_TIMEOUT
=
10
# 创建就绪超时时间(秒)
POLLING_INTERVAL
=
1
# 轮询间隔(秒)
...
...
@@ -90,11 +91,11 @@ def fetch_records(sync_type, last_sync):
def
clean_solution_html
(
solution_html
):
"""清洗solution_html,保留img标签,其他标签替换为空"""
return
re
.
sub
(
r'<(?!img\b)[^>]*>'
,
''
,
solution_html
)
return
re
.
sub
(
r'
|
<(?!img\b)[^>]*>'
,
''
,
solution_html
)
def
split_content
(
alarm_name
,
content
):
"""将内容
按{SEGMENT_LENGTH}字分段,每段以alarm_name开头并添加换行
"""
"""将内容
分段,支持重叠长度控制,每段以alarm_name开头并添加换行,总长度不超过RERANK_MAX_LENGTH
"""
segments
=
[]
start
=
0
content_length
=
len
(
content
)
...
...
@@ -102,23 +103,45 @@ def split_content(alarm_name, content):
# 处理空报警名称的情况
if
not
alarm_name
:
alarm_name
=
""
# 计算alarm_name占用的长度(包含换行符)
header_length
=
len
(
alarm_name
)
+
1
# +1 是换行符的长度
# 计算内容部分的最大长度
max_content_length
=
RERANK_MAX_LENGTH
-
header_length
# 确保重叠长度不超过最大内容长度
effective_overlap
=
min
(
OVERLAP_LENGTH
,
max_content_length
)
# 如果报警名称太长,直接返回报警名称作为一个分段
if
header_length
>
RERANK_MAX_LENGTH
:
segments
.
append
(
alarm_name
[:
RERANK_MAX_LENGTH
])
return
segments
while
start
<
content_length
:
end
=
start
+
SEGMENT_LENGTH
# 处理最后一段
# 计算本次分段的结束位置
end
=
start
+
max_content_length
# 如果是最后一段,直接截取到末尾
if
end
>=
content_length
:
segment
=
content
[
start
:]
segment
_content
=
content
[
start
:]
# 构造以alarm_name开头的分段
formatted_segment
=
f
"{alarm_name}
\n
{segment}"
formatted_segment
=
f
"{alarm_name}
\n
{segment
_content
}"
segments
.
append
(
formatted_segment
)
break
# 截取当前分段
segment
=
content
[
start
:
end
]
segment
_content
=
content
[
start
:
end
]
# 构造以alarm_name开头的分段
formatted_segment
=
f
"{alarm_name}
\n
{segment}"
formatted_segment
=
f
"{alarm_name}
\n
{segment
_content
}"
segments
.
append
(
formatted_segment
)
start
=
end
# 计算下一段的起始位置(考虑重叠)
next_start
=
end
-
effective_overlap
# 防止重叠后起始位置回退
if
next_start
<=
start
:
next_start
=
start
+
1
# 至少前进1个字符,避免无限循环
start
=
next_start
# 过滤空分段
return
[
s
for
s
in
segments
if
s
.
strip
()]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment