ruff
This commit is contained in:
@@ -9,13 +9,15 @@ import sqlite3
|
||||
import re
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def clean_group_name(name: str) -> str:
    """Reduce a group name to CJK ideographs and ASCII letters.

    Every character outside the ranges U+4E00-U+9FA5, ``a-z`` and ``A-Z``
    is removed (digits, whitespace and punctuation included).

    Args:
        name: Raw group name.

    Returns:
        The filtered name, or today's date formatted as ``YYYYMMDD`` when
        nothing survives the filter.
    """
    cleaned = re.sub(r"[^\u4e00-\u9fa5a-zA-Z]", "", name)
    if not cleaned:
        # Fall back to the current date so the result is never empty.
        cleaned = datetime.now().strftime("%Y%m%d")
    return cleaned
|
||||
|
||||
|
||||
def get_group_name(stream_id: str) -> str:
|
||||
"""从数据库中获取群组名称"""
|
||||
conn = sqlite3.connect("data/maibot.db")
|
||||
@@ -43,6 +45,7 @@ def get_group_name(stream_id: str) -> str:
|
||||
return clean_group_name(f"{platform}{stream_id[:8]}")
|
||||
return stream_id
|
||||
|
||||
|
||||
def format_timestamp(timestamp: float) -> str:
|
||||
"""将时间戳转换为可读的时间格式"""
|
||||
if not timestamp:
|
||||
@@ -50,132 +53,140 @@ def format_timestamp(timestamp: float) -> str:
|
||||
try:
|
||||
dt = datetime.fromtimestamp(timestamp)
|
||||
return dt.strftime("%Y-%m-%d %H:%M:%S")
|
||||
except:
|
||||
except Exception as e:
|
||||
print(f"时间戳格式化错误: {e}")
|
||||
return "未知"
|
||||
|
||||
|
||||
def load_expressions(chat_id: str) -> List[Dict]:
    """Load the learnt style expressions stored for one chat.

    Reads ``data/expression/learnt_style/<chat_id>/expressions.json``
    relative to the current working directory.

    Args:
        chat_id: Chat identifier; converted with ``str()`` to build the path.

    Returns:
        The parsed JSON content, or an empty list when the file is missing.
    """
    style_file = os.path.join("data", "expression", "learnt_style", str(chat_id), "expressions.json")

    style_exprs = []
    if os.path.exists(style_file):
        with open(style_file, "r", encoding="utf-8") as f:
            style_exprs = json.load(f)

    return style_exprs
|
||||
|
||||
|
||||
def find_similar_expressions(
    expressions: List[Dict], top_k: int = 5
) -> Dict[int, Dict[str, List[Tuple[str, str, float]]]]:
    """Find, for every expression, its most similar peers by situation and by style.

    The ``situation`` and ``style`` texts are vectorized separately with
    TF-IDF and compared with cosine similarity.

    Args:
        expressions: Dicts that each provide ``"situation"`` and ``"style"``.
        top_k: Maximum number of neighbours reported per expression and per
            field. Values below zero are treated as zero.

    Returns:
        A dict keyed by the expression's index in ``expressions``. Each value
        has two lists:

        * ``"situations"``: ``(situation, style, score)`` of the peers whose
          situation text is most similar.
        * ``"styles"``: ``(style, situation, score)`` of the peers whose
          style text is most similar.

        Only peers with a score above zero are listed, and an index is
        omitted entirely when both lists are empty. Empty input yields ``{}``.
    """
    if not expressions:
        return {}

    situations = [expr["situation"] for expr in expressions]
    styles = [expr["style"] for expr in expressions]

    # Each field is fitted independently, so each gets its own vocabulary.
    # NOTE(review): fit_transform presumably raises ValueError when no text
    # yields a token (empty vocabulary) - confirm whether callers need a guard.
    situation_similarity = cosine_similarity(TfidfVectorizer().fit_transform(situations))
    style_similarity = cosine_similarity(TfidfVectorizer().fit_transform(styles))

    limit = max(top_k, 0)

    def _best_matches(scores, own_index: int) -> List[int]:
        """Return up to ``limit`` indices with positive score, best first, never ``own_index``."""
        ranked = np.argsort(scores)[::-1]
        # Drop the row's own index explicitly: it is not guaranteed to rank
        # first when another row ties with it (duplicates) or when the row
        # itself is an all-zero vector.
        ranked = ranked[ranked != own_index][:limit]
        return [int(idx) for idx in ranked if scores[idx] > 0]

    similar_expressions = {}
    for i, _ in enumerate(expressions):
        situation_scores = situation_similarity[i]
        style_scores = style_similarity[i]

        similar_situations = [
            (
                expressions[idx]["situation"],
                expressions[idx]["style"],  # the expression paired with that situation
                situation_scores[idx],
            )
            for idx in _best_matches(situation_scores, i)
        ]
        similar_styles = [
            (
                expressions[idx]["style"],
                expressions[idx]["situation"],  # the situation paired with that expression
                style_scores[idx],
            )
            for idx in _best_matches(style_scores, i)
        ]

        if similar_situations or similar_styles:
            similar_expressions[i] = {"situations": similar_situations, "styles": similar_styles}

    return similar_expressions
|
||||
|
||||
|
||||
def main():
    """Interactively pick a chat and print similar expressions for each of its entries.

    Lists every entry under ``data/expression/learnt_style``, asks the user
    for a number (0 quits), loads that chat's expressions and prints, for
    each expression that has neighbours, the similar styles (yellow) and
    similar situations (green) together with its count and last-active time.
    """
    # Collect all chat IDs from the learnt_style directory entries.
    style_dirs = glob.glob(os.path.join("data", "expression", "learnt_style", "*"))
    chat_ids = [os.path.basename(d) for d in style_dirs]

    if not chat_ids:
        print("没有找到任何群聊的表达方式数据")
        return

    print("可用的群聊:")
    for i, chat_id in enumerate(chat_ids, 1):
        group_name = get_group_name(chat_id)
        print(f"{i}. {group_name}")

    while True:
        # Only the integer parse can raise ValueError, so keep the try minimal.
        try:
            choice = int(input("\n请选择要分析的群聊编号 (输入0退出): "))
        except ValueError:
            print("请输入有效的数字")
            continue
        if choice == 0:
            return
        if 1 <= choice <= len(chat_ids):
            chat_id = chat_ids[choice - 1]
            break
        print("无效的选择,请重试")

    # Load the expressions of the selected chat.
    style_exprs = load_expressions(chat_id)

    group_name = get_group_name(chat_id)
    print(f"\n分析群聊 {group_name} 的表达方式:")

    similar_styles = find_similar_expressions(style_exprs)
    for i, expr in enumerate(style_exprs):
        # Expressions without any neighbour are absent from the result dict.
        if i not in similar_styles:
            continue

        print("\n" + "-" * 20)
        print(f"表达方式:{expr['style']} <---> 情景:{expr['situation']}")

        if similar_styles[i]["styles"]:
            print("\n\033[33m相似表达:\033[0m")
            for similar_style, original_situation, score in similar_styles[i]["styles"]:
                print(f"\033[33m{similar_style},score:{score:.3f},对应情景:{original_situation}\033[0m")

        if similar_styles[i]["situations"]:
            print("\n\033[32m相似情景:\033[0m")
            for similar_situation, original_style, score in similar_styles[i]["situations"]:
                print(f"\033[32m{similar_situation},score:{score:.3f},对应表达:{original_style}\033[0m")

        print(
            f"\n激活值:{expr.get('count', 1):.3f},上次激活时间:{format_timestamp(expr.get('last_active_time'))}"
        )
        print("-" * 20)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the interactive analysis once, and only when executed as a script.
    main()
|
||||
|
||||
@@ -6,15 +6,17 @@ from datetime import datetime
|
||||
from typing import Dict, List, Any
|
||||
import sqlite3
|
||||
|
||||
|
||||
def clean_group_name(name: str) -> str:
    """Reduce a group name to CJK ideographs and ASCII letters.

    Args:
        name: Raw group name.

    Returns:
        ``name`` with every character outside U+4E00-U+9FA5, ``a-z`` and
        ``A-Z`` removed, or today's date as ``YYYYMMDD`` when nothing is left.
    """
    # Keep only Chinese and English characters.
    cleaned = re.sub(r"[^\u4e00-\u9fa5a-zA-Z]", "", name)
    # If nothing remains after cleaning, use the current date instead.
    if not cleaned:
        cleaned = datetime.now().strftime("%Y%m%d")
    return cleaned
|
||||
|
||||
|
||||
def get_group_name(stream_id: str) -> str:
|
||||
"""从数据库中获取群组名称"""
|
||||
conn = sqlite3.connect("data/maibot.db")
|
||||
@@ -42,41 +44,44 @@ def get_group_name(stream_id: str) -> str:
|
||||
return clean_group_name(f"{platform}{stream_id[:8]}")
|
||||
return stream_id
|
||||
|
||||
|
||||
def load_expressions(chat_id: str) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Load the style, grammar and personality expressions for one group.

    Files read, relative to the current working directory:

    * ``data/expression/learnt_style/<chat_id>/expressions.json``
    * ``data/expression/learnt_grammar/<chat_id>/expressions.json``
    * ``data/expression/personality/expressions.json`` (not chat specific)

    Args:
        chat_id: Group identifier; converted with ``str()`` to build the paths.

    Returns:
        ``(style_expressions, grammar_expressions, personality_expressions)``.
        Each element is the parsed JSON content, or an empty list when the
        corresponding file does not exist.
    """

    def _read_if_present(path: str) -> List[Dict[str, Any]]:
        """Return the parsed JSON at ``path``, or ``[]`` when the file is missing."""
        if not os.path.exists(path):
            return []
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    learnt_style_file = os.path.join("data", "expression", "learnt_style", str(chat_id), "expressions.json")
    learnt_grammar_file = os.path.join("data", "expression", "learnt_grammar", str(chat_id), "expressions.json")
    personality_file = os.path.join("data", "expression", "personality", "expressions.json")

    style_expressions = _read_if_present(learnt_style_file)
    grammar_expressions = _read_if_present(learnt_grammar_file)
    personality_expressions = _read_if_present(personality_file)

    return style_expressions, grammar_expressions, personality_expressions
|
||||
|
||||
|
||||
def format_time(timestamp: float) -> str:
    """Format a POSIX timestamp as a ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        timestamp: Seconds since the epoch, passed to
            ``datetime.fromtimestamp`` (so the result is in local time).

    Returns:
        The formatted date-time string.
    """
    return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
||||
def write_expressions(f, expressions: List[Dict[str, Any]], title: str):
|
||||
"""写入表达方式列表"""
|
||||
if not expressions:
|
||||
f.write(f"{title}:暂无数据\n")
|
||||
f.write("-" * 40 + "\n")
|
||||
return
|
||||
|
||||
|
||||
f.write(f"{title}:\n")
|
||||
for expr in expressions:
|
||||
count = expr.get("count", 0)
|
||||
@@ -87,103 +92,111 @@ def write_expressions(f, expressions: List[Dict[str, Any]], title: str):
|
||||
f.write(f"最后活跃: {format_time(last_active)}\n")
|
||||
f.write("-" * 40 + "\n")
|
||||
|
||||
def write_group_report(
    group_file: str,
    group_name: str,
    chat_id: str,
    style_exprs: List[Dict[str, Any]],
    grammar_exprs: List[Dict[str, Any]],
):
    """Write the detailed report for one group to ``group_file``.

    The file is created or overwritten (UTF-8). It starts with a header
    naming the group, followed by a language-style section and a
    syntax-feature section, each rendered through ``write_expressions``.

    Args:
        group_file: Path of the report file to write.
        group_name: Display name used in the header.
        chat_id: Group identifier used in the header.
        style_exprs: Expressions for the language-style section.
        grammar_exprs: Expressions for the syntax-feature section.
    """
    with open(group_file, "w", encoding="utf-8") as gf:
        gf.write(f"群组: {group_name} (ID: {chat_id})\n")
        gf.write("=" * 80 + "\n\n")

        # Language-style section.
        gf.write("【语言风格】\n")
        gf.write("=" * 40 + "\n")
        write_expressions(gf, style_exprs, "语言风格")
        gf.write("\n")

        # Syntax-feature section.
        gf.write("【句法特点】\n")
        gf.write("=" * 40 + "\n")
        write_expressions(gf, grammar_exprs, "句法特点")
|
||||
|
||||
|
||||
def analyze_expressions():
|
||||
"""分析所有群组的表达方式"""
|
||||
# 获取所有群组ID
|
||||
style_dir = os.path.join("data", "expression", "learnt_style")
|
||||
chat_ids = [d for d in os.listdir(style_dir) if os.path.isdir(os.path.join(style_dir, d))]
|
||||
|
||||
|
||||
# 创建输出目录
|
||||
output_dir = "data/expression_analysis"
|
||||
personality_dir = os.path.join(output_dir, "personality")
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
os.makedirs(personality_dir, exist_ok=True)
|
||||
|
||||
|
||||
# 生成时间戳
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
|
||||
# 创建总报告
|
||||
summary_file = os.path.join(output_dir, f"summary_{timestamp}.txt")
|
||||
with open(summary_file, "w", encoding="utf-8") as f:
|
||||
f.write(f"表达方式分析报告 - 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
|
||||
f.write("=" * 80 + "\n\n")
|
||||
|
||||
|
||||
# 先处理人格表达
|
||||
personality_exprs = []
|
||||
personality_file = os.path.join("data", "expression", "personality", "expressions.json")
|
||||
if os.path.exists(personality_file):
|
||||
with open(personality_file, "r", encoding="utf-8") as pf:
|
||||
personality_exprs = json.load(pf)
|
||||
|
||||
|
||||
# 保存人格表达总数
|
||||
total_personality = len(personality_exprs)
|
||||
|
||||
|
||||
# 排序并取前20条
|
||||
personality_exprs.sort(key=lambda x: x.get("count", 0), reverse=True)
|
||||
personality_exprs = personality_exprs[:20]
|
||||
|
||||
|
||||
# 写入人格表达报告
|
||||
personality_report = os.path.join(personality_dir, f"expressions_{timestamp}.txt")
|
||||
with open(personality_report, "w", encoding="utf-8") as pf:
|
||||
pf.write("【人格表达方式】\n")
|
||||
pf.write("=" * 40 + "\n")
|
||||
write_expressions(pf, personality_exprs, "人格表达")
|
||||
|
||||
|
||||
# 写入总报告摘要中的人格表达部分
|
||||
f.write("【人格表达方式】\n")
|
||||
f.write("=" * 40 + "\n")
|
||||
f.write(f"人格表达总数: {total_personality} (显示前20条)\n")
|
||||
f.write(f"详细报告: {personality_report}\n")
|
||||
f.write("-" * 40 + "\n\n")
|
||||
|
||||
|
||||
# 处理各个群组的表达方式
|
||||
f.write("【群组表达方式】\n")
|
||||
f.write("=" * 40 + "\n\n")
|
||||
|
||||
|
||||
for chat_id in chat_ids:
|
||||
style_exprs, grammar_exprs, _ = load_expressions(chat_id)
|
||||
|
||||
|
||||
# 保存总数
|
||||
total_style = len(style_exprs)
|
||||
total_grammar = len(grammar_exprs)
|
||||
|
||||
|
||||
# 分别排序
|
||||
style_exprs.sort(key=lambda x: x.get("count", 0), reverse=True)
|
||||
grammar_exprs.sort(key=lambda x: x.get("count", 0), reverse=True)
|
||||
|
||||
|
||||
# 只取前20条
|
||||
style_exprs = style_exprs[:20]
|
||||
grammar_exprs = grammar_exprs[:20]
|
||||
|
||||
|
||||
# 获取群组名称
|
||||
group_name = get_group_name(chat_id)
|
||||
|
||||
|
||||
# 创建群组子目录(使用清理后的名称)
|
||||
safe_group_name = clean_group_name(group_name)
|
||||
group_dir = os.path.join(output_dir, f"{safe_group_name}_{chat_id}")
|
||||
os.makedirs(group_dir, exist_ok=True)
|
||||
|
||||
|
||||
# 写入群组详细报告
|
||||
group_file = os.path.join(group_dir, f"expressions_{timestamp}.txt")
|
||||
write_group_report(group_file, group_name, chat_id, style_exprs, grammar_exprs)
|
||||
|
||||
|
||||
# 写入总报告摘要
|
||||
f.write(f"群组: {group_name} (ID: {chat_id})\n")
|
||||
f.write("-" * 40 + "\n")
|
||||
@@ -191,11 +204,12 @@ def analyze_expressions():
|
||||
f.write(f"句法特点总数: {total_grammar} (显示前20条)\n")
|
||||
f.write(f"详细报告: {group_file}\n")
|
||||
f.write("-" * 40 + "\n\n")
|
||||
|
||||
|
||||
print("分析报告已生成:")
|
||||
print(f"总报告: {summary_file}")
|
||||
print(f"人格表达报告: {personality_report}")
|
||||
print(f"各群组详细报告位于: {output_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Generate the reports once, and only when executed as a script.
    analyze_expressions()
|
||||
|
||||
@@ -71,14 +71,14 @@ def analyze_group_similarity():
|
||||
# 获取所有群组目录
|
||||
base_dir = Path("data/expression/learnt_style")
|
||||
group_dirs = [d for d in base_dir.iterdir() if d.is_dir()]
|
||||
|
||||
|
||||
# 加载所有群组的数据并过滤
|
||||
valid_groups = []
|
||||
valid_names = []
|
||||
valid_situations = []
|
||||
valid_styles = []
|
||||
valid_combined = []
|
||||
|
||||
|
||||
for d in group_dirs:
|
||||
situations, styles, combined, total_count = load_group_data(d)
|
||||
if total_count >= 50: # 只保留数据量大于等于50的群组
|
||||
@@ -87,11 +87,11 @@ def analyze_group_similarity():
|
||||
valid_situations.append(" ".join(situations))
|
||||
valid_styles.append(" ".join(styles))
|
||||
valid_combined.append(" ".join(combined))
|
||||
|
||||
|
||||
if not valid_groups:
|
||||
print("没有找到数据量大于等于50的群组")
|
||||
return
|
||||
|
||||
|
||||
# 创建TF-IDF向量化器
|
||||
vectorizer = TfidfVectorizer()
|
||||
|
||||
|
||||
@@ -3,117 +3,123 @@ import json
|
||||
import random
|
||||
from typing import List, Dict, Tuple
|
||||
import glob
|
||||
from datetime import datetime
|
||||
|
||||
MAX_EXPRESSION_COUNT = 300 # Maximum number of expressions kept per group chat
|
||||
MIN_COUNT_THRESHOLD = 0.01 # Minimum usage-count threshold; entries not above it are removed
|
||||
|
||||
|
||||
def load_expressions(chat_id: str) -> Tuple[List[Dict], List[Dict]]:
    """Load the learnt style and grammar expressions of one chat.

    Reads ``expressions.json`` from ``data/expression/learnt_style/<chat_id>``
    and ``data/expression/learnt_grammar/<chat_id>``, relative to the current
    working directory.

    Args:
        chat_id: Chat identifier; converted with ``str()`` to build the paths.

    Returns:
        ``(style_exprs, grammar_exprs)``. Each element is the parsed JSON
        content, or an empty list when its file does not exist.
    """
    style_file = os.path.join("data", "expression", "learnt_style", str(chat_id), "expressions.json")
    grammar_file = os.path.join("data", "expression", "learnt_grammar", str(chat_id), "expressions.json")

    style_exprs = []
    grammar_exprs = []

    if os.path.exists(style_file):
        with open(style_file, "r", encoding="utf-8") as f:
            style_exprs = json.load(f)

    if os.path.exists(grammar_file):
        with open(grammar_file, "r", encoding="utf-8") as f:
            grammar_exprs = json.load(f)

    return style_exprs, grammar_exprs
|
||||
|
||||
|
||||
def save_expressions(chat_id: str, style_exprs: List[Dict], grammar_exprs: List[Dict]) -> None:
    """Write the style and grammar expressions of one chat back to disk.

    Both lists are dumped as indented UTF-8 JSON (non-ASCII characters are
    kept verbatim) to ``expressions.json`` under
    ``data/expression/learnt_style/<chat_id>`` and
    ``data/expression/learnt_grammar/<chat_id>``. Missing directories are
    created; existing files are overwritten.

    Args:
        chat_id: Chat identifier; converted with ``str()`` to build the paths.
        style_exprs: Style expressions to store.
        grammar_exprs: Grammar expressions to store.
    """
    style_file = os.path.join("data", "expression", "learnt_style", str(chat_id), "expressions.json")
    grammar_file = os.path.join("data", "expression", "learnt_grammar", str(chat_id), "expressions.json")

    os.makedirs(os.path.dirname(style_file), exist_ok=True)
    os.makedirs(os.path.dirname(grammar_file), exist_ok=True)

    with open(style_file, "w", encoding="utf-8") as f:
        json.dump(style_exprs, f, ensure_ascii=False, indent=2)

    with open(grammar_file, "w", encoding="utf-8") as f:
        json.dump(grammar_exprs, f, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
def cleanup_expressions(expressions: List[Dict]) -> List[Dict]:
    """Prune an expression list by usage count and cap its size.

    Steps:

    1. Drop every expression whose ``"count"`` (default 0) is not strictly
       greater than ``MIN_COUNT_THRESHOLD``.
    2. If more than ``MAX_EXPRESSION_COUNT`` remain, keep the
       ``MAX_EXPRESSION_COUNT // 2`` entries with the highest count and fill
       the rest of the quota with a random selection from the remainder.

    The input list itself is not modified.

    Args:
        expressions: Expression dicts, optionally carrying a ``"count"``.

    Returns:
        The pruned list, at most ``MAX_EXPRESSION_COUNT`` entries long.
    """
    if not expressions:
        return []

    # 1. Remove expressions that are used too rarely.
    expressions = [expr for expr in expressions if expr.get("count", 0) > MIN_COUNT_THRESHOLD]

    # 2. Over the limit: keep the most used half of the quota, sample the rest.
    if len(expressions) > MAX_EXPRESSION_COUNT:
        expressions.sort(key=lambda x: x.get("count", 0), reverse=True)

        keep_count = MAX_EXPRESSION_COUNT // 2
        keep_exprs = expressions[:keep_count]

        # Pick the remaining slots at random from the lower-ranked entries.
        # This must run exactly once, otherwise the cap would be exceeded.
        remaining_exprs = expressions[keep_count:]
        random.shuffle(remaining_exprs)
        keep_exprs.extend(remaining_exprs[: MAX_EXPRESSION_COUNT - keep_count])

        expressions = keep_exprs

    return expressions
|
||||
|
||||
|
||||
def main():
    """Clean up the stored expressions of every chat and print a summary.

    For each entry under ``data/expression/learnt_style`` the style and
    grammar expressions are loaded, pruned with ``cleanup_expressions`` and
    saved back. Per-chat and overall before/after counts are printed.
    """
    # Collect all chat IDs from the learnt_style directory entries.
    style_dirs = glob.glob(os.path.join("data", "expression", "learnt_style", "*"))
    chat_ids = [os.path.basename(d) for d in style_dirs]

    if not chat_ids:
        print("没有找到任何群聊的表达方式数据")
        return

    print(f"开始清理 {len(chat_ids)} 个群聊的表达方式数据...")

    total_style_before = 0
    total_style_after = 0
    total_grammar_before = 0
    total_grammar_after = 0

    for chat_id in chat_ids:
        print(f"\n处理群聊 {chat_id}:")

        style_exprs, grammar_exprs = load_expressions(chat_id)

        # Counts before cleanup.
        style_count_before = len(style_exprs)
        grammar_count_before = len(grammar_exprs)
        total_style_before += style_count_before
        total_grammar_before += grammar_count_before

        style_exprs = cleanup_expressions(style_exprs)
        grammar_exprs = cleanup_expressions(grammar_exprs)

        # Counts after cleanup.
        style_count_after = len(style_exprs)
        grammar_count_after = len(grammar_exprs)
        total_style_after += style_count_after
        total_grammar_after += grammar_count_after

        # Persist the pruned lists.
        save_expressions(chat_id, style_exprs, grammar_exprs)

        print(f"语言风格: {style_count_before} -> {style_count_after}")
        print(f"句法特点: {grammar_count_before} -> {grammar_count_after}")

    print("\n清理完成!")
    print(f"语言风格总数: {total_style_before} -> {total_style_after}")
    print(f"句法特点总数: {total_grammar_before} -> {total_grammar_after}")
    print(
        f"总共清理了 {total_style_before + total_grammar_before - total_style_after - total_grammar_after} 条表达方式"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the cleanup once, and only when executed as a script.
    main()
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
import json
|
||||
@@ -15,13 +16,15 @@ import random
|
||||
from src.llm_models.utils_model import LLMRequest
|
||||
from src.config.config import global_config
|
||||
|
||||
|
||||
def clean_group_name(name: str) -> str:
    """Reduce a group name to CJK ideographs and ASCII letters.

    Args:
        name: Raw group name.

    Returns:
        ``name`` with every character outside U+4E00-U+9FA5, ``a-z`` and
        ``A-Z`` removed, or today's date as ``YYYYMMDD`` when nothing is left.
    """
    cleaned = re.sub(r"[^\u4e00-\u9fa5a-zA-Z]", "", name)
    if not cleaned:
        # Never return an empty name; use the current date instead.
        cleaned = datetime.now().strftime("%Y%m%d")
    return cleaned
|
||||
|
||||
|
||||
def get_group_name(stream_id: str) -> str:
|
||||
"""从数据库中获取群组名称"""
|
||||
conn = sqlite3.connect("data/maibot.db")
|
||||
@@ -49,76 +52,79 @@ def get_group_name(stream_id: str) -> str:
|
||||
return clean_group_name(f"{platform}{stream_id[:8]}")
|
||||
return stream_id
|
||||
|
||||
|
||||
def load_expressions(chat_id: str) -> List[Dict]:
    """Load the learnt style expressions of one chat, sampling large sets.

    Reads ``data/expression/learnt_style/<chat_id>/expressions.json``
    relative to the current working directory. When the file holds more than
    50 expressions, 50 of them are drawn at random and a notice stating the
    original total and the sample size is printed.

    Args:
        chat_id: Chat identifier; converted with ``str()`` to build the path.

    Returns:
        The loaded (possibly sampled) expressions, or an empty list when the
        file does not exist.
    """
    sample_size = 50
    style_file = os.path.join("data", "expression", "learnt_style", str(chat_id), "expressions.json")

    style_exprs = []

    if os.path.exists(style_file):
        with open(style_file, "r", encoding="utf-8") as f:
            style_exprs = json.load(f)

    # Cap the working set; remember the original size so the notice reports
    # the real total rather than the post-sampling length.
    total = len(style_exprs)
    if total > sample_size:
        style_exprs = random.sample(style_exprs, sample_size)
        print(f"\n从 {total} 个表达方式中随机选择了 {sample_size} 个进行匹配")

    return style_exprs
|
||||
|
||||
def find_similar_expressions_tfidf(
    input_text: str, expressions: List[Dict], mode: str = "both", top_k: int = 10
) -> List[Tuple[str, str, float]]:
    """Rank stored expressions by TF-IDF cosine similarity to ``input_text``.

    Args:
        input_text: Query text.
        expressions: Dicts that each provide ``"situation"`` and ``"style"``.
        mode: Which field is compared with the query: ``"style"``,
            ``"situation"``, or anything else for both fields joined by a
            space.
        top_k: Maximum number of results.

    Returns:
        Up to ``top_k`` tuples ``(style, situation, score)`` ordered by
        descending score. Entries with a score of zero or less are left out.
        Empty ``expressions`` yields ``[]``.
    """
    if not expressions:
        return []

    # Build one document per expression according to the matching mode.
    if mode == "style":
        texts = [expr["style"] for expr in expressions]
    elif mode == "situation":
        texts = [expr["situation"] for expr in expressions]
    else:  # both
        texts = [f"{expr['situation']} {expr['style']}" for expr in expressions]

    # The query is vectorized together with the corpus as the last row.
    texts.append(input_text)
    tfidf_matrix = TfidfVectorizer().fit_transform(texts)

    # Only the query row is needed, so compare it against the corpus rows
    # instead of building the full pairwise matrix.
    scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]

    top_indices = np.argsort(scores)[::-1][:top_k]

    return [
        (expressions[idx]["style"], expressions[idx]["situation"], scores[idx])
        for idx in top_indices
        if scores[idx] > 0  # keep only entries that share something with the query
    ]
|
||||
|
||||
async def find_similar_expressions_embedding(input_text: str, expressions: List[Dict], mode: str = "both", top_k: int = 5) -> List[Tuple[str, str, float]]:
|
||||
|
||||
async def find_similar_expressions_embedding(
|
||||
input_text: str, expressions: List[Dict], mode: str = "both", top_k: int = 5
|
||||
) -> List[Tuple[str, str, float]]:
|
||||
"""使用嵌入模型找出与输入文本最相似的top_k个表达方式"""
|
||||
if not expressions:
|
||||
return []
|
||||
|
||||
|
||||
# 准备文本数据
|
||||
if mode == "style":
|
||||
texts = [expr['style'] for expr in expressions]
|
||||
texts = [expr["style"] for expr in expressions]
|
||||
elif mode == "situation":
|
||||
texts = [expr['situation'] for expr in expressions]
|
||||
texts = [expr["situation"] for expr in expressions]
|
||||
else: # both
|
||||
texts = [f"{expr['situation']} {expr['style']}" for expr in expressions]
|
||||
|
||||
|
||||
# 获取嵌入向量
|
||||
llm_request = LLMRequest(global_config.model.embedding)
|
||||
text_embeddings = []
|
||||
@@ -126,73 +132,70 @@ async def find_similar_expressions_embedding(input_text: str, expressions: List[
|
||||
embedding = await llm_request.get_embedding(text)
|
||||
if embedding:
|
||||
text_embeddings.append(embedding)
|
||||
|
||||
|
||||
input_embedding = await llm_request.get_embedding(input_text)
|
||||
if not input_embedding or not text_embeddings:
|
||||
return []
|
||||
|
||||
|
||||
# 计算余弦相似度
|
||||
text_embeddings = np.array(text_embeddings)
|
||||
similarities = np.dot(text_embeddings, input_embedding) / (
|
||||
np.linalg.norm(text_embeddings, axis=1) * np.linalg.norm(input_embedding)
|
||||
)
|
||||
|
||||
|
||||
# 获取top_k的索引
|
||||
top_indices = np.argsort(similarities)[::-1][:top_k]
|
||||
|
||||
|
||||
# 获取相似表达
|
||||
similar_exprs = []
|
||||
for idx in top_indices:
|
||||
if similarities[idx] > 0: # 只保留有相似度的
|
||||
similar_exprs.append((
|
||||
expressions[idx]['style'],
|
||||
expressions[idx]['situation'],
|
||||
similarities[idx]
|
||||
))
|
||||
|
||||
similar_exprs.append((expressions[idx]["style"], expressions[idx]["situation"], similarities[idx]))
|
||||
|
||||
return similar_exprs
|
||||
|
||||
|
||||
async def main():
|
||||
# 获取所有群聊ID
|
||||
style_dirs = glob.glob(os.path.join("data", "expression", "learnt_style", "*"))
|
||||
chat_ids = [os.path.basename(d) for d in style_dirs]
|
||||
|
||||
|
||||
if not chat_ids:
|
||||
print("没有找到任何群聊的表达方式数据")
|
||||
return
|
||||
|
||||
|
||||
print("可用的群聊:")
|
||||
for i, chat_id in enumerate(chat_ids, 1):
|
||||
group_name = get_group_name(chat_id)
|
||||
print(f"{i}. {group_name}")
|
||||
|
||||
|
||||
while True:
|
||||
try:
|
||||
choice = int(input("\n请选择要分析的群聊编号 (输入0退出): "))
|
||||
if choice == 0:
|
||||
break
|
||||
if 1 <= choice <= len(chat_ids):
|
||||
chat_id = chat_ids[choice-1]
|
||||
chat_id = chat_ids[choice - 1]
|
||||
break
|
||||
print("无效的选择,请重试")
|
||||
except ValueError:
|
||||
print("请输入有效的数字")
|
||||
|
||||
|
||||
if choice == 0:
|
||||
return
|
||||
|
||||
|
||||
# 加载表达方式
|
||||
style_exprs = load_expressions(chat_id)
|
||||
|
||||
|
||||
group_name = get_group_name(chat_id)
|
||||
print(f"\n已选择群聊:{group_name}")
|
||||
|
||||
|
||||
# 选择匹配模式
|
||||
print("\n请选择匹配模式:")
|
||||
print("1. 匹配表达方式")
|
||||
print("2. 匹配情景")
|
||||
print("3. 两者都考虑")
|
||||
|
||||
|
||||
while True:
|
||||
try:
|
||||
mode_choice = int(input("\n请选择匹配模式 (1-3): "))
|
||||
@@ -201,19 +204,15 @@ async def main():
|
||||
print("无效的选择,请重试")
|
||||
except ValueError:
|
||||
print("请输入有效的数字")
|
||||
|
||||
mode_map = {
|
||||
1: "style",
|
||||
2: "situation",
|
||||
3: "both"
|
||||
}
|
||||
|
||||
mode_map = {1: "style", 2: "situation", 3: "both"}
|
||||
mode = mode_map[mode_choice]
|
||||
|
||||
|
||||
# 选择匹配方法
|
||||
print("\n请选择匹配方法:")
|
||||
print("1. TF-IDF方法")
|
||||
print("2. 嵌入模型方法")
|
||||
|
||||
|
||||
while True:
|
||||
try:
|
||||
method_choice = int(input("\n请选择匹配方法 (1-2): "))
|
||||
@@ -222,20 +221,20 @@ async def main():
|
||||
print("无效的选择,请重试")
|
||||
except ValueError:
|
||||
print("请输入有效的数字")
|
||||
|
||||
|
||||
while True:
|
||||
input_text = input("\n请输入要匹配的文本(输入q退出): ")
|
||||
if input_text.lower() == 'q':
|
||||
if input_text.lower() == "q":
|
||||
break
|
||||
|
||||
|
||||
if not input_text.strip():
|
||||
continue
|
||||
|
||||
|
||||
if method_choice == 1:
|
||||
similar_exprs = find_similar_expressions_tfidf(input_text, style_exprs, mode)
|
||||
else:
|
||||
similar_exprs = await find_similar_expressions_embedding(input_text, style_exprs, mode)
|
||||
|
||||
|
||||
if similar_exprs:
|
||||
print("\n找到以下相似表达:")
|
||||
for style, situation, score in similar_exprs:
|
||||
@@ -246,6 +245,8 @@ async def main():
|
||||
else:
|
||||
print("\n没有找到相似的表达方式")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import asyncio

    # Drive the async entry point once, and only when executed as a script.
    asyncio.run(main())
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user