diff --git a/scripts/analyze_group_similarity.py b/scripts/analyze_group_similarity.py
index b61167f70..5775a7121 100644
--- a/scripts/analyze_group_similarity.py
+++ b/scripts/analyze_group_similarity.py
@@ -60,9 +60,9 @@ def load_group_data(group_dir):
 
     for item in data:
         count = item["count"]
-        situations.extend([item["situation"]] * count)
-        styles.extend([item["style"]] * count)
-        combined.extend([f"{item['situation']} {item['style']}"] * count)
+        situations.extend([item["situation"]] * int(count))
+        styles.extend([item["style"]] * int(count))
+        combined.extend([f"{item['situation']} {item['style']}"] * int(count))
 
     return situations, styles, combined, total_count
 
diff --git a/src/chat/focus_chat/expressors/exprssion_learner.py b/src/chat/focus_chat/expressors/exprssion_learner.py
index 8332fac09..ca980e89a 100644
--- a/src/chat/focus_chat/expressors/exprssion_learner.py
+++ b/src/chat/focus_chat/expressors/exprssion_learner.py
@@ -280,6 +280,19 @@ class ExpressionLearner:
                     new_expr["last_active_time"] = current_time
                     old_data.append(new_expr)
 
+            # Enforce the size cap: evict surplus entries, favouring low-count ones.
+            if len(old_data) > MAX_EXPRESSION_COUNT:
+                remove_count = len(old_data) - MAX_EXPRESSION_COUNT
+                # Weighted sampling WITHOUT replacement (Efraimidis-Spirakis): an entry
+                # with weight w = 1 / (count + 0.1) draws the key u ** (1 / w), and the
+                # largest keys are evicted. random.choices() samples with replacement,
+                # so duplicate indices could pop the wrong entries or raise IndexError.
+                keys = [random.random() ** (expr.get("count", 1) + 0.1) for expr in old_data]
+                by_key = sorted(range(len(old_data)), key=keys.__getitem__, reverse=True)
+                # Delete from the back so the remaining indices stay valid.
+                for idx in sorted(by_key[:remove_count], reverse=True):
+                    old_data.pop(idx)
+
             with open(file_path, "w", encoding="utf-8") as f:
                 json.dump(old_data, f, ensure_ascii=False, indent=2)
 
diff --git a/src/chat/heart_flow/observation/chatting_observation.py b/src/chat/heart_flow/observation/chatting_observation.py
index 4df814a33..187d3534c 100644
--- a/src/chat/heart_flow/observation/chatting_observation.py
+++ b/src/chat/heart_flow/observation/chatting_observation.py
@@ -132,7 +132,9 @@ class ChattingObservation(Observation):
                     # logger.debug(f"找到的锚定消息:find_msg: {find_msg}")
                     break
                 else:
-                    similarity = difflib.SequenceMatcher(None, text, message["raw_message"]).ratio()
+                    # Fall back to the processed text when raw_message is empty or None.
+                    candidate = message["raw_message"] or message["processed_plain_text"] or ""
+                    similarity = difflib.SequenceMatcher(None, text, candidate).ratio()
                     msg_list.append({"message": message, "similarity": similarity})
                     # logger.debug(f"对锚定消息检查:message: {message['processed_plain_text']},similarity: {similarity}")
             if not find_msg:
diff --git a/src/config/config.py b/src/config/config.py
index 480cf6550..6360b973a 100644
--- a/src/config/config.py
+++ b/src/config/config.py
@@ -47,7 +47,7 @@ TEMPLATE_DIR = "template"
 
 # 考虑到,实际上配置文件中的mai_version是不会自动更新的,所以采用硬编码
 # 对该字段的更新,请严格参照语义化版本规范:https://semver.org/lang/zh-CN/
-MMC_VERSION = "0.7.2-snapshot.1"
+MMC_VERSION = "0.7.3-snapshot.1"
 
 
 def update_config():