Files
Mofox-Core/src/chat/knowledge/utils/dyn_topk.py
2025-06-07 13:46:04 +08:00

48 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from typing import List, Any, Tuple
def dyn_select_top_k(
score: List[Tuple[Any, float]], jmp_factor: float, var_factor: float
) -> List[Tuple[Any, float, float]]:
"""动态TopK选择"""
# 按照分数排序(降序)
sorted_score = sorted(score, key=lambda x: x[1], reverse=True)
# 归一化
max_score = sorted_score[0][1]
min_score = sorted_score[-1][1]
normalized_score = []
for score_item in sorted_score:
normalized_score.append(
tuple(
[
score_item[0],
score_item[1],
(score_item[1] - min_score) / (max_score - min_score),
]
)
)
# 寻找跳变点score变化最大的位置
jump_idx = 0
for i in range(1, len(normalized_score)):
if abs(normalized_score[i][2] - normalized_score[i - 1][2]) > abs(
normalized_score[jump_idx][2] - normalized_score[jump_idx - 1][2]
):
jump_idx = i
# 跳变阈值
jump_threshold = normalized_score[jump_idx][2]
# 计算均值
mean_score = sum([s[2] for s in normalized_score]) / len(normalized_score)
# 计算方差
var_score = sum([(s[2] - mean_score) ** 2 for s in normalized_score]) / len(normalized_score)
# 动态阈值
threshold = jmp_factor * jump_threshold + (1 - jmp_factor) * (mean_score + var_factor * var_score)
# 重新过滤
res = [s for s in normalized_score if s[2] > threshold]
return res