修复代码格式和文件名大小写问题
This commit is contained in:
@@ -5,12 +5,11 @@ from typing import Dict, List
|
||||
|
||||
# Add project root to Python path
|
||||
from src.common.database.database_model import Expression, ChatStreams
|
||||
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, project_root)
|
||||
|
||||
|
||||
|
||||
|
||||
def get_chat_name(chat_id: str) -> str:
|
||||
"""Get chat name from chat_id by querying ChatStreams table directly"""
|
||||
try:
|
||||
@@ -18,7 +17,7 @@ def get_chat_name(chat_id: str) -> str:
|
||||
chat_stream = ChatStreams.get_or_none(ChatStreams.stream_id == chat_id)
|
||||
if chat_stream is None:
|
||||
return f"未知聊天 ({chat_id})"
|
||||
|
||||
|
||||
# 如果有群组信息,显示群组名称
|
||||
if chat_stream.group_name:
|
||||
return f"{chat_stream.group_name} ({chat_id})"
|
||||
@@ -35,117 +34,106 @@ def calculate_time_distribution(expressions) -> Dict[str, int]:
|
||||
"""Calculate distribution of last active time in days"""
|
||||
now = time.time()
|
||||
distribution = {
|
||||
'0-1天': 0,
|
||||
'1-3天': 0,
|
||||
'3-7天': 0,
|
||||
'7-14天': 0,
|
||||
'14-30天': 0,
|
||||
'30-60天': 0,
|
||||
'60-90天': 0,
|
||||
'90+天': 0
|
||||
"0-1天": 0,
|
||||
"1-3天": 0,
|
||||
"3-7天": 0,
|
||||
"7-14天": 0,
|
||||
"14-30天": 0,
|
||||
"30-60天": 0,
|
||||
"60-90天": 0,
|
||||
"90+天": 0,
|
||||
}
|
||||
for expr in expressions:
|
||||
diff_days = (now - expr.last_active_time) / (24*3600)
|
||||
diff_days = (now - expr.last_active_time) / (24 * 3600)
|
||||
if diff_days < 1:
|
||||
distribution['0-1天'] += 1
|
||||
distribution["0-1天"] += 1
|
||||
elif diff_days < 3:
|
||||
distribution['1-3天'] += 1
|
||||
distribution["1-3天"] += 1
|
||||
elif diff_days < 7:
|
||||
distribution['3-7天'] += 1
|
||||
distribution["3-7天"] += 1
|
||||
elif diff_days < 14:
|
||||
distribution['7-14天'] += 1
|
||||
distribution["7-14天"] += 1
|
||||
elif diff_days < 30:
|
||||
distribution['14-30天'] += 1
|
||||
distribution["14-30天"] += 1
|
||||
elif diff_days < 60:
|
||||
distribution['30-60天'] += 1
|
||||
distribution["30-60天"] += 1
|
||||
elif diff_days < 90:
|
||||
distribution['60-90天'] += 1
|
||||
distribution["60-90天"] += 1
|
||||
else:
|
||||
distribution['90+天'] += 1
|
||||
distribution["90+天"] += 1
|
||||
return distribution
|
||||
|
||||
|
||||
def calculate_count_distribution(expressions) -> Dict[str, int]:
|
||||
"""Calculate distribution of count values"""
|
||||
distribution = {
|
||||
'0-1': 0,
|
||||
'1-2': 0,
|
||||
'2-3': 0,
|
||||
'3-4': 0,
|
||||
'4-5': 0,
|
||||
'5-10': 0,
|
||||
'10+': 0
|
||||
}
|
||||
distribution = {"0-1": 0, "1-2": 0, "2-3": 0, "3-4": 0, "4-5": 0, "5-10": 0, "10+": 0}
|
||||
for expr in expressions:
|
||||
cnt = expr.count
|
||||
if cnt < 1:
|
||||
distribution['0-1'] += 1
|
||||
distribution["0-1"] += 1
|
||||
elif cnt < 2:
|
||||
distribution['1-2'] += 1
|
||||
distribution["1-2"] += 1
|
||||
elif cnt < 3:
|
||||
distribution['2-3'] += 1
|
||||
distribution["2-3"] += 1
|
||||
elif cnt < 4:
|
||||
distribution['3-4'] += 1
|
||||
distribution["3-4"] += 1
|
||||
elif cnt < 5:
|
||||
distribution['4-5'] += 1
|
||||
distribution["4-5"] += 1
|
||||
elif cnt < 10:
|
||||
distribution['5-10'] += 1
|
||||
distribution["5-10"] += 1
|
||||
else:
|
||||
distribution['10+'] += 1
|
||||
distribution["10+"] += 1
|
||||
return distribution
|
||||
|
||||
|
||||
def get_top_expressions_by_chat(chat_id: str, top_n: int = 5) -> List[Expression]:
|
||||
"""Get top N most used expressions for a specific chat_id"""
|
||||
return (Expression.select()
|
||||
.where(Expression.chat_id == chat_id)
|
||||
.order_by(Expression.count.desc())
|
||||
.limit(top_n))
|
||||
return Expression.select().where(Expression.chat_id == chat_id).order_by(Expression.count.desc()).limit(top_n)
|
||||
|
||||
|
||||
def show_overall_statistics(expressions, total: int) -> None:
|
||||
"""Show overall statistics"""
|
||||
time_dist = calculate_time_distribution(expressions)
|
||||
count_dist = calculate_count_distribution(expressions)
|
||||
|
||||
|
||||
print("\n=== 总体统计 ===")
|
||||
print(f"总表达式数量: {total}")
|
||||
|
||||
|
||||
print("\n上次激活时间分布:")
|
||||
for period, count in time_dist.items():
|
||||
print(f"{period}: {count} ({count/total*100:.2f}%)")
|
||||
|
||||
print(f"{period}: {count} ({count / total * 100:.2f}%)")
|
||||
|
||||
print("\ncount分布:")
|
||||
for range_, count in count_dist.items():
|
||||
print(f"{range_}: {count} ({count/total*100:.2f}%)")
|
||||
print(f"{range_}: {count} ({count / total * 100:.2f}%)")
|
||||
|
||||
|
||||
def show_chat_statistics(chat_id: str, chat_name: str) -> None:
|
||||
"""Show statistics for a specific chat"""
|
||||
chat_exprs = list(Expression.select().where(Expression.chat_id == chat_id))
|
||||
chat_total = len(chat_exprs)
|
||||
|
||||
|
||||
print(f"\n=== {chat_name} ===")
|
||||
print(f"表达式数量: {chat_total}")
|
||||
|
||||
|
||||
if chat_total == 0:
|
||||
print("该聊天没有表达式数据")
|
||||
return
|
||||
|
||||
|
||||
# Time distribution for this chat
|
||||
time_dist = calculate_time_distribution(chat_exprs)
|
||||
print("\n上次激活时间分布:")
|
||||
for period, count in time_dist.items():
|
||||
if count > 0:
|
||||
print(f"{period}: {count} ({count/chat_total*100:.2f}%)")
|
||||
|
||||
print(f"{period}: {count} ({count / chat_total * 100:.2f}%)")
|
||||
|
||||
# Count distribution for this chat
|
||||
count_dist = calculate_count_distribution(chat_exprs)
|
||||
print("\ncount分布:")
|
||||
for range_, count in count_dist.items():
|
||||
if count > 0:
|
||||
print(f"{range_}: {count} ({count/chat_total*100:.2f}%)")
|
||||
|
||||
print(f"{range_}: {count} ({count / chat_total * 100:.2f}%)")
|
||||
|
||||
# Top expressions
|
||||
print("\nTop 10使用最多的表达式:")
|
||||
top_exprs = get_top_expressions_by_chat(chat_id, 10)
|
||||
@@ -163,32 +151,32 @@ def interactive_menu() -> None:
|
||||
if not expressions:
|
||||
print("数据库中没有找到表达式")
|
||||
return
|
||||
|
||||
|
||||
total = len(expressions)
|
||||
|
||||
|
||||
# Get unique chat_ids and their names
|
||||
chat_ids = list(set(expr.chat_id for expr in expressions))
|
||||
chat_info = [(chat_id, get_chat_name(chat_id)) for chat_id in chat_ids]
|
||||
chat_info.sort(key=lambda x: x[1]) # Sort by chat name
|
||||
|
||||
|
||||
while True:
|
||||
print("\n" + "="*50)
|
||||
print("\n" + "=" * 50)
|
||||
print("表达式统计分析")
|
||||
print("="*50)
|
||||
print("=" * 50)
|
||||
print("0. 显示总体统计")
|
||||
|
||||
|
||||
for i, (chat_id, chat_name) in enumerate(chat_info, 1):
|
||||
chat_count = sum(1 for expr in expressions if expr.chat_id == chat_id)
|
||||
print(f"{i}. {chat_name} ({chat_count}个表达式)")
|
||||
|
||||
|
||||
print("q. 退出")
|
||||
|
||||
|
||||
choice = input("\n请选择要查看的统计 (输入序号): ").strip()
|
||||
|
||||
if choice.lower() == 'q':
|
||||
|
||||
if choice.lower() == "q":
|
||||
print("再见!")
|
||||
break
|
||||
|
||||
|
||||
try:
|
||||
choice_num = int(choice)
|
||||
if choice_num == 0:
|
||||
@@ -200,9 +188,9 @@ def interactive_menu() -> None:
|
||||
print("无效的选择,请重新输入")
|
||||
except ValueError:
|
||||
print("请输入有效的数字")
|
||||
|
||||
|
||||
input("\n按回车键继续...")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
interactive_menu()
|
||||
interactive_menu()
|
||||
|
||||
@@ -23,6 +23,7 @@ OPENIE_DIR = os.path.join(ROOT_PATH, "data", "openie")
|
||||
|
||||
logger = get_logger("OpenIE导入")
|
||||
|
||||
|
||||
def ensure_openie_dir():
|
||||
"""确保OpenIE数据目录存在"""
|
||||
if not os.path.exists(OPENIE_DIR):
|
||||
@@ -253,7 +254,7 @@ def main():
|
||||
# 没有运行的事件循环,创建新的
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
||||
|
||||
try:
|
||||
# 在新的事件循环中运行异步主函数
|
||||
loop.run_until_complete(main_async())
|
||||
|
||||
@@ -12,6 +12,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
||||
from rich.progress import Progress # 替换为 rich 进度条
|
||||
|
||||
from src.common.logger import get_logger
|
||||
|
||||
# from src.chat.knowledge.lpmmconfig import global_config
|
||||
from src.chat.knowledge.ie_process import info_extract_from_str
|
||||
from src.chat.knowledge.open_ie import OpenIE
|
||||
@@ -36,6 +37,7 @@ TEMP_DIR = os.path.join(ROOT_PATH, "temp")
|
||||
# IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, "data", "imported_lpmm_data")
|
||||
OPENIE_OUTPUT_DIR = os.path.join(ROOT_PATH, "data", "openie")
|
||||
|
||||
|
||||
def ensure_dirs():
|
||||
"""确保临时目录和输出目录存在"""
|
||||
if not os.path.exists(TEMP_DIR):
|
||||
@@ -48,6 +50,7 @@ def ensure_dirs():
|
||||
os.makedirs(RAW_DATA_PATH)
|
||||
logger.info(f"已创建原始数据目录: {RAW_DATA_PATH}")
|
||||
|
||||
|
||||
# 创建一个线程安全的锁,用于保护文件操作和共享数据
|
||||
file_lock = Lock()
|
||||
open_ie_doc_lock = Lock()
|
||||
@@ -56,13 +59,11 @@ open_ie_doc_lock = Lock()
|
||||
shutdown_event = Event()
|
||||
|
||||
lpmm_entity_extract_llm = LLMRequest(
|
||||
model_set=model_config.model_task_config.lpmm_entity_extract,
|
||||
request_type="lpmm.entity_extract"
|
||||
)
|
||||
lpmm_rdf_build_llm = LLMRequest(
|
||||
model_set=model_config.model_task_config.lpmm_rdf_build,
|
||||
request_type="lpmm.rdf_build"
|
||||
model_set=model_config.model_task_config.lpmm_entity_extract, request_type="lpmm.entity_extract"
|
||||
)
|
||||
lpmm_rdf_build_llm = LLMRequest(model_set=model_config.model_task_config.lpmm_rdf_build, request_type="lpmm.rdf_build")
|
||||
|
||||
|
||||
def process_single_text(pg_hash, raw_data):
|
||||
"""处理单个文本的函数,用于线程池"""
|
||||
temp_file_path = f"{TEMP_DIR}/{pg_hash}.json"
|
||||
@@ -97,7 +98,7 @@ def process_single_text(pg_hash, raw_data):
|
||||
with file_lock:
|
||||
try:
|
||||
with open(temp_file_path, "w", encoding="utf-8") as f:
|
||||
f.write(orjson.dumps(doc_item, option=orjson.OPT_INDENT_2).decode('utf-8'))
|
||||
f.write(orjson.dumps(doc_item, option=orjson.OPT_INDENT_2).decode("utf-8"))
|
||||
except Exception as e:
|
||||
logger.error(f"保存缓存文件失败:{pg_hash}, 错误:{e}")
|
||||
# 如果保存失败,确保不会留下损坏的文件
|
||||
@@ -201,10 +202,10 @@ def main(): # sourcery skip: comprehension-to-generator, extract-method
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(
|
||||
orjson.dumps(
|
||||
openie_obj.to_dict() if hasattr(openie_obj, "to_dict") else openie_obj.__dict__,
|
||||
option=orjson.OPT_INDENT_2
|
||||
).decode('utf-8')
|
||||
)
|
||||
openie_obj.to_dict() if hasattr(openie_obj, "to_dict") else openie_obj.__dict__,
|
||||
option=orjson.OPT_INDENT_2,
|
||||
).decode("utf-8")
|
||||
)
|
||||
logger.info(f"信息提取结果已保存到: {output_path}")
|
||||
else:
|
||||
logger.warning("没有可保存的信息提取结果")
|
||||
|
||||
@@ -3,12 +3,11 @@ import sys
|
||||
import os
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from datetime import datetime
|
||||
|
||||
# Add project root to Python path
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, project_root)
|
||||
from src.common.database.database_model import Messages, ChatStreams #noqa
|
||||
|
||||
|
||||
from src.common.database.database_model import Messages, ChatStreams # noqa
|
||||
|
||||
|
||||
def get_chat_name(chat_id: str) -> str:
|
||||
@@ -17,7 +16,7 @@ def get_chat_name(chat_id: str) -> str:
|
||||
chat_stream = ChatStreams.get_or_none(ChatStreams.stream_id == chat_id)
|
||||
if chat_stream is None:
|
||||
return f"未知聊天 ({chat_id})"
|
||||
|
||||
|
||||
if chat_stream.group_name:
|
||||
return f"{chat_stream.group_name} ({chat_id})"
|
||||
elif chat_stream.user_nickname:
|
||||
@@ -39,66 +38,62 @@ def format_timestamp(timestamp: float) -> str:
|
||||
def calculate_interest_value_distribution(messages) -> Dict[str, int]:
|
||||
"""Calculate distribution of interest_value"""
|
||||
distribution = {
|
||||
'0.000-0.010': 0,
|
||||
'0.010-0.050': 0,
|
||||
'0.050-0.100': 0,
|
||||
'0.100-0.500': 0,
|
||||
'0.500-1.000': 0,
|
||||
'1.000-2.000': 0,
|
||||
'2.000-5.000': 0,
|
||||
'5.000-10.000': 0,
|
||||
'10.000+': 0
|
||||
"0.000-0.010": 0,
|
||||
"0.010-0.050": 0,
|
||||
"0.050-0.100": 0,
|
||||
"0.100-0.500": 0,
|
||||
"0.500-1.000": 0,
|
||||
"1.000-2.000": 0,
|
||||
"2.000-5.000": 0,
|
||||
"5.000-10.000": 0,
|
||||
"10.000+": 0,
|
||||
}
|
||||
|
||||
|
||||
for msg in messages:
|
||||
if msg.interest_value is None or msg.interest_value == 0.0:
|
||||
continue
|
||||
|
||||
|
||||
value = float(msg.interest_value)
|
||||
if value < 0.010:
|
||||
distribution['0.000-0.010'] += 1
|
||||
distribution["0.000-0.010"] += 1
|
||||
elif value < 0.050:
|
||||
distribution['0.010-0.050'] += 1
|
||||
distribution["0.010-0.050"] += 1
|
||||
elif value < 0.100:
|
||||
distribution['0.050-0.100'] += 1
|
||||
distribution["0.050-0.100"] += 1
|
||||
elif value < 0.500:
|
||||
distribution['0.100-0.500'] += 1
|
||||
distribution["0.100-0.500"] += 1
|
||||
elif value < 1.000:
|
||||
distribution['0.500-1.000'] += 1
|
||||
distribution["0.500-1.000"] += 1
|
||||
elif value < 2.000:
|
||||
distribution['1.000-2.000'] += 1
|
||||
distribution["1.000-2.000"] += 1
|
||||
elif value < 5.000:
|
||||
distribution['2.000-5.000'] += 1
|
||||
distribution["2.000-5.000"] += 1
|
||||
elif value < 10.000:
|
||||
distribution['5.000-10.000'] += 1
|
||||
distribution["5.000-10.000"] += 1
|
||||
else:
|
||||
distribution['10.000+'] += 1
|
||||
|
||||
distribution["10.000+"] += 1
|
||||
|
||||
return distribution
|
||||
|
||||
|
||||
def get_interest_value_stats(messages) -> Dict[str, float]:
|
||||
"""Calculate basic statistics for interest_value"""
|
||||
values = [float(msg.interest_value) for msg in messages if msg.interest_value is not None and msg.interest_value != 0.0]
|
||||
|
||||
values = [
|
||||
float(msg.interest_value) for msg in messages if msg.interest_value is not None and msg.interest_value != 0.0
|
||||
]
|
||||
|
||||
if not values:
|
||||
return {
|
||||
'count': 0,
|
||||
'min': 0,
|
||||
'max': 0,
|
||||
'avg': 0,
|
||||
'median': 0
|
||||
}
|
||||
|
||||
return {"count": 0, "min": 0, "max": 0, "avg": 0, "median": 0}
|
||||
|
||||
values.sort()
|
||||
count = len(values)
|
||||
|
||||
|
||||
return {
|
||||
'count': count,
|
||||
'min': min(values),
|
||||
'max': max(values),
|
||||
'avg': sum(values) / count,
|
||||
'median': values[count // 2] if count % 2 == 1 else (values[count // 2 - 1] + values[count // 2]) / 2
|
||||
"count": count,
|
||||
"min": min(values),
|
||||
"max": max(values),
|
||||
"avg": sum(values) / count,
|
||||
"median": values[count // 2] if count % 2 == 1 else (values[count // 2 - 1] + values[count // 2]) / 2,
|
||||
}
|
||||
|
||||
|
||||
@@ -109,20 +104,24 @@ def get_available_chats() -> List[Tuple[str, str, int]]:
|
||||
chat_counts = {}
|
||||
for msg in Messages.select(Messages.chat_id).distinct():
|
||||
chat_id = msg.chat_id
|
||||
count = Messages.select().where(
|
||||
(Messages.chat_id == chat_id) &
|
||||
(Messages.interest_value.is_null(False)) &
|
||||
(Messages.interest_value != 0.0)
|
||||
).count()
|
||||
count = (
|
||||
Messages.select()
|
||||
.where(
|
||||
(Messages.chat_id == chat_id)
|
||||
& (Messages.interest_value.is_null(False))
|
||||
& (Messages.interest_value != 0.0)
|
||||
)
|
||||
.count()
|
||||
)
|
||||
if count > 0:
|
||||
chat_counts[chat_id] = count
|
||||
|
||||
|
||||
# 获取聊天名称
|
||||
result = []
|
||||
for chat_id, count in chat_counts.items():
|
||||
chat_name = get_chat_name(chat_id)
|
||||
result.append((chat_id, chat_name, count))
|
||||
|
||||
|
||||
# 按消息数量排序
|
||||
result.sort(key=lambda x: x[2], reverse=True)
|
||||
return result
|
||||
@@ -135,30 +134,30 @@ def get_time_range_input() -> Tuple[Optional[float], Optional[float]]:
|
||||
"""Get time range input from user"""
|
||||
print("\n时间范围选择:")
|
||||
print("1. 最近1天")
|
||||
print("2. 最近3天")
|
||||
print("2. 最近3天")
|
||||
print("3. 最近7天")
|
||||
print("4. 最近30天")
|
||||
print("5. 自定义时间范围")
|
||||
print("6. 不限制时间")
|
||||
|
||||
|
||||
choice = input("请选择时间范围 (1-6): ").strip()
|
||||
|
||||
|
||||
now = time.time()
|
||||
|
||||
|
||||
if choice == "1":
|
||||
return now - 24*3600, now
|
||||
return now - 24 * 3600, now
|
||||
elif choice == "2":
|
||||
return now - 3*24*3600, now
|
||||
return now - 3 * 24 * 3600, now
|
||||
elif choice == "3":
|
||||
return now - 7*24*3600, now
|
||||
return now - 7 * 24 * 3600, now
|
||||
elif choice == "4":
|
||||
return now - 30*24*3600, now
|
||||
return now - 30 * 24 * 3600, now
|
||||
elif choice == "5":
|
||||
print("请输入开始时间 (格式: YYYY-MM-DD HH:MM:SS):")
|
||||
start_str = input().strip()
|
||||
print("请输入结束时间 (格式: YYYY-MM-DD HH:MM:SS):")
|
||||
end_str = input().strip()
|
||||
|
||||
|
||||
try:
|
||||
start_time = datetime.strptime(start_str, "%Y-%m-%d %H:%M:%S").timestamp()
|
||||
end_time = datetime.strptime(end_str, "%Y-%m-%d %H:%M:%S").timestamp()
|
||||
@@ -170,41 +169,40 @@ def get_time_range_input() -> Tuple[Optional[float], Optional[float]]:
|
||||
return None, None
|
||||
|
||||
|
||||
def analyze_interest_values(chat_id: Optional[str] = None, start_time: Optional[float] = None, end_time: Optional[float] = None) -> None:
|
||||
def analyze_interest_values(
|
||||
chat_id: Optional[str] = None, start_time: Optional[float] = None, end_time: Optional[float] = None
|
||||
) -> None:
|
||||
"""Analyze interest values with optional filters"""
|
||||
|
||||
|
||||
# 构建查询条件
|
||||
query = Messages.select().where(
|
||||
(Messages.interest_value.is_null(False)) &
|
||||
(Messages.interest_value != 0.0)
|
||||
)
|
||||
|
||||
query = Messages.select().where((Messages.interest_value.is_null(False)) & (Messages.interest_value != 0.0))
|
||||
|
||||
if chat_id:
|
||||
query = query.where(Messages.chat_id == chat_id)
|
||||
|
||||
|
||||
if start_time:
|
||||
query = query.where(Messages.time >= start_time)
|
||||
|
||||
|
||||
if end_time:
|
||||
query = query.where(Messages.time <= end_time)
|
||||
|
||||
|
||||
messages = list(query)
|
||||
|
||||
|
||||
if not messages:
|
||||
print("没有找到符合条件的消息")
|
||||
return
|
||||
|
||||
|
||||
# 计算统计信息
|
||||
distribution = calculate_interest_value_distribution(messages)
|
||||
stats = get_interest_value_stats(messages)
|
||||
|
||||
|
||||
# 显示结果
|
||||
print("\n=== Interest Value 分析结果 ===")
|
||||
if chat_id:
|
||||
print(f"聊天: {get_chat_name(chat_id)}")
|
||||
else:
|
||||
print("聊天: 全部聊天")
|
||||
|
||||
|
||||
if start_time and end_time:
|
||||
print(f"时间范围: {format_timestamp(start_time)} 到 {format_timestamp(end_time)}")
|
||||
elif start_time:
|
||||
@@ -213,16 +211,16 @@ def analyze_interest_values(chat_id: Optional[str] = None, start_time: Optional[
|
||||
print(f"时间范围: {format_timestamp(end_time)} 之前")
|
||||
else:
|
||||
print("时间范围: 不限制")
|
||||
|
||||
|
||||
print("\n基本统计:")
|
||||
print(f"有效消息数量: {stats['count']} (排除null和0值)")
|
||||
print(f"最小值: {stats['min']:.3f}")
|
||||
print(f"最大值: {stats['max']:.3f}")
|
||||
print(f"平均值: {stats['avg']:.3f}")
|
||||
print(f"中位数: {stats['median']:.3f}")
|
||||
|
||||
|
||||
print("\nInterest Value 分布:")
|
||||
total = stats['count']
|
||||
total = stats["count"]
|
||||
for range_name, count in distribution.items():
|
||||
if count > 0:
|
||||
percentage = count / total * 100
|
||||
@@ -231,34 +229,34 @@ def analyze_interest_values(chat_id: Optional[str] = None, start_time: Optional[
|
||||
|
||||
def interactive_menu() -> None:
|
||||
"""Interactive menu for interest value analysis"""
|
||||
|
||||
|
||||
while True:
|
||||
print("\n" + "="*50)
|
||||
print("\n" + "=" * 50)
|
||||
print("Interest Value 分析工具")
|
||||
print("="*50)
|
||||
print("=" * 50)
|
||||
print("1. 分析全部聊天")
|
||||
print("2. 选择特定聊天分析")
|
||||
print("q. 退出")
|
||||
|
||||
|
||||
choice = input("\n请选择分析模式 (1-2, q): ").strip()
|
||||
|
||||
if choice.lower() == 'q':
|
||||
|
||||
if choice.lower() == "q":
|
||||
print("再见!")
|
||||
break
|
||||
|
||||
|
||||
chat_id = None
|
||||
|
||||
|
||||
if choice == "2":
|
||||
# 显示可用的聊天列表
|
||||
chats = get_available_chats()
|
||||
if not chats:
|
||||
print("没有找到有interest_value数据的聊天")
|
||||
continue
|
||||
|
||||
|
||||
print(f"\n可用的聊天 (共{len(chats)}个):")
|
||||
for i, (_cid, name, count) in enumerate(chats, 1):
|
||||
print(f"{i}. {name} ({count}条有效消息)")
|
||||
|
||||
|
||||
try:
|
||||
chat_choice = int(input(f"\n请选择聊天 (1-{len(chats)}): ").strip())
|
||||
if 1 <= chat_choice <= len(chats):
|
||||
@@ -269,19 +267,19 @@ def interactive_menu() -> None:
|
||||
except ValueError:
|
||||
print("请输入有效数字")
|
||||
continue
|
||||
|
||||
|
||||
elif choice != "1":
|
||||
print("无效选择")
|
||||
continue
|
||||
|
||||
|
||||
# 获取时间范围
|
||||
start_time, end_time = get_time_range_input()
|
||||
|
||||
|
||||
# 执行分析
|
||||
analyze_interest_values(chat_id, start_time, end_time)
|
||||
|
||||
|
||||
input("\n按回车键继续...")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
interactive_menu()
|
||||
interactive_menu()
|
||||
|
||||
@@ -199,7 +199,7 @@ class LogFormatter:
|
||||
parts.append(event)
|
||||
elif isinstance(event, dict):
|
||||
try:
|
||||
parts.append(orjson.dumps(event).decode('utf-8'))
|
||||
parts.append(orjson.dumps(event).decode("utf-8"))
|
||||
except (TypeError, ValueError):
|
||||
parts.append(str(event))
|
||||
else:
|
||||
@@ -212,7 +212,7 @@ class LogFormatter:
|
||||
if key not in ("timestamp", "level", "logger_name", "event"):
|
||||
if isinstance(value, (dict, list)):
|
||||
try:
|
||||
value_str = orjson.dumps(value).decode('utf-8')
|
||||
value_str = orjson.dumps(value).decode("utf-8")
|
||||
except (TypeError, ValueError):
|
||||
value_str = str(value)
|
||||
else:
|
||||
@@ -829,7 +829,7 @@ class LogViewer:
|
||||
parts, tags = self.formatter.format_log_entry(log_entry)
|
||||
line_text = " ".join(parts)
|
||||
log_lines.append(line_text)
|
||||
|
||||
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write("\n".join(log_lines))
|
||||
messagebox.showinfo("导出成功", f"日志已导出到: {filename}")
|
||||
@@ -855,10 +855,7 @@ class LogViewer:
|
||||
mapping_file.parent.mkdir(exist_ok=True)
|
||||
try:
|
||||
with open(mapping_file, "w", encoding="utf-8") as f:
|
||||
f.write(orjson.dumps(
|
||||
self.module_name_mapping,
|
||||
option=orjson.OPT_INDENT_2
|
||||
).decode('utf-8'))
|
||||
f.write(orjson.dumps(self.module_name_mapping, option=orjson.OPT_INDENT_2).decode("utf-8"))
|
||||
except Exception as e:
|
||||
print(f"保存模块映射失败: {e}")
|
||||
|
||||
@@ -1192,15 +1189,16 @@ class LogViewer:
|
||||
line_count += 1
|
||||
except orjson.JSONDecodeError:
|
||||
continue
|
||||
|
||||
|
||||
# 如果发现了新模块,在主线程中更新模块集合
|
||||
if new_modules:
|
||||
|
||||
def update_modules():
|
||||
self.modules.update(new_modules)
|
||||
self.update_module_list()
|
||||
|
||||
|
||||
self.root.after(0, update_modules)
|
||||
|
||||
|
||||
return new_entries
|
||||
|
||||
def append_new_logs(self, new_entries):
|
||||
@@ -1428,4 +1426,3 @@ def main():
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
@@ -51,10 +51,7 @@ def create_minimal_manifest(plugin_dir: str, plugin_name: str, description: str
|
||||
|
||||
try:
|
||||
with open(manifest_path, "w", encoding="utf-8") as f:
|
||||
f.write(orjson.dumps(
|
||||
minimal_manifest,
|
||||
option=orjson.OPT_INDENT_2
|
||||
).decode('utf-8'))
|
||||
f.write(orjson.dumps(minimal_manifest, option=orjson.OPT_INDENT_2).decode("utf-8"))
|
||||
print(f"✅ 已创建最小化manifest文件: {manifest_path}")
|
||||
return True
|
||||
except Exception as e:
|
||||
@@ -102,10 +99,7 @@ def create_complete_manifest(plugin_dir: str, plugin_name: str) -> bool:
|
||||
|
||||
try:
|
||||
with open(manifest_path, "w", encoding="utf-8") as f:
|
||||
f.write(orjson.dumps(
|
||||
complete_manifest,
|
||||
option=orjson.OPT_INDENT_2
|
||||
).decode('utf-8'))
|
||||
f.write(orjson.dumps(complete_manifest, option=orjson.OPT_INDENT_2).decode("utf-8"))
|
||||
print(f"✅ 已创建完整manifest模板: {manifest_path}")
|
||||
print("💡 请根据实际情况修改manifest文件中的内容")
|
||||
return True
|
||||
|
||||
@@ -2,6 +2,7 @@ import os
|
||||
from pathlib import Path
|
||||
import sys # 新增系统模块导入
|
||||
from src.chat.knowledge.utils.hash import get_sha256
|
||||
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
|
||||
from src.common.logger import get_logger
|
||||
|
||||
@@ -10,6 +11,7 @@ ROOT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||
RAW_DATA_PATH = os.path.join(ROOT_PATH, "data/lpmm_raw_data")
|
||||
# IMPORTED_DATA_PATH = os.path.join(ROOT_PATH, "data/imported_lpmm_data")
|
||||
|
||||
|
||||
def _process_text_file(file_path):
|
||||
"""处理单个文本文件,返回段落列表"""
|
||||
with open(file_path, "r", encoding="utf-8") as f:
|
||||
@@ -44,6 +46,7 @@ def _process_multi_files() -> list:
|
||||
all_paragraphs.extend(paragraphs)
|
||||
return all_paragraphs
|
||||
|
||||
|
||||
def load_raw_data() -> tuple[list[str], list[str]]:
|
||||
"""加载原始数据文件
|
||||
|
||||
@@ -72,4 +75,4 @@ def load_raw_data() -> tuple[list[str], list[str]]:
|
||||
raw_data.append(item)
|
||||
logger.info(f"共读取到{len(raw_data)}条数据")
|
||||
|
||||
return sha256_list, raw_data
|
||||
return sha256_list, raw_data
|
||||
|
||||
@@ -4,21 +4,22 @@ import os
|
||||
import re
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from datetime import datetime
|
||||
|
||||
# Add project root to Python path
|
||||
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, project_root)
|
||||
from src.common.database.database_model import Messages, ChatStreams #noqa
|
||||
from src.common.database.database_model import Messages, ChatStreams # noqa
|
||||
|
||||
|
||||
def contains_emoji_or_image_tags(text: str) -> bool:
|
||||
"""Check if text contains [表情包xxxxx] or [图片xxxxx] tags"""
|
||||
if not text:
|
||||
return False
|
||||
|
||||
|
||||
# 检查是否包含 [表情包] 或 [图片] 标记
|
||||
emoji_pattern = r'\[表情包[^\]]*\]'
|
||||
image_pattern = r'\[图片[^\]]*\]'
|
||||
|
||||
emoji_pattern = r"\[表情包[^\]]*\]"
|
||||
image_pattern = r"\[图片[^\]]*\]"
|
||||
|
||||
return bool(re.search(emoji_pattern, text) or re.search(image_pattern, text))
|
||||
|
||||
|
||||
@@ -26,14 +27,14 @@ def clean_reply_text(text: str) -> str:
|
||||
"""Remove reply references like [回复 xxxx...] from text"""
|
||||
if not text:
|
||||
return text
|
||||
|
||||
|
||||
# 匹配 [回复 xxxx...] 格式的内容
|
||||
# 使用非贪婪匹配,匹配到第一个 ] 就停止
|
||||
cleaned_text = re.sub(r'\[回复[^\]]*\]', '', text)
|
||||
|
||||
cleaned_text = re.sub(r"\[回复[^\]]*\]", "", text)
|
||||
|
||||
# 去除多余的空白字符
|
||||
cleaned_text = cleaned_text.strip()
|
||||
|
||||
|
||||
return cleaned_text
|
||||
|
||||
|
||||
@@ -43,7 +44,7 @@ def get_chat_name(chat_id: str) -> str:
|
||||
chat_stream = ChatStreams.get_or_none(ChatStreams.stream_id == chat_id)
|
||||
if chat_stream is None:
|
||||
return f"未知聊天 ({chat_id})"
|
||||
|
||||
|
||||
if chat_stream.group_name:
|
||||
return f"{chat_stream.group_name} ({chat_id})"
|
||||
elif chat_stream.user_nickname:
|
||||
@@ -65,63 +66,63 @@ def format_timestamp(timestamp: float) -> str:
|
||||
def calculate_text_length_distribution(messages) -> Dict[str, int]:
|
||||
"""Calculate distribution of processed_plain_text length"""
|
||||
distribution = {
|
||||
'0': 0, # 空文本
|
||||
'1-5': 0, # 极短文本
|
||||
'6-10': 0, # 很短文本
|
||||
'11-20': 0, # 短文本
|
||||
'21-30': 0, # 较短文本
|
||||
'31-50': 0, # 中短文本
|
||||
'51-70': 0, # 中等文本
|
||||
'71-100': 0, # 较长文本
|
||||
'101-150': 0, # 长文本
|
||||
'151-200': 0, # 很长文本
|
||||
'201-300': 0, # 超长文本
|
||||
'301-500': 0, # 极长文本
|
||||
'501-1000': 0, # 巨长文本
|
||||
'1000+': 0 # 超巨长文本
|
||||
"0": 0, # 空文本
|
||||
"1-5": 0, # 极短文本
|
||||
"6-10": 0, # 很短文本
|
||||
"11-20": 0, # 短文本
|
||||
"21-30": 0, # 较短文本
|
||||
"31-50": 0, # 中短文本
|
||||
"51-70": 0, # 中等文本
|
||||
"71-100": 0, # 较长文本
|
||||
"101-150": 0, # 长文本
|
||||
"151-200": 0, # 很长文本
|
||||
"201-300": 0, # 超长文本
|
||||
"301-500": 0, # 极长文本
|
||||
"501-1000": 0, # 巨长文本
|
||||
"1000+": 0, # 超巨长文本
|
||||
}
|
||||
|
||||
|
||||
for msg in messages:
|
||||
if msg.processed_plain_text is None:
|
||||
continue
|
||||
|
||||
|
||||
# 排除包含表情包或图片标记的消息
|
||||
if contains_emoji_or_image_tags(msg.processed_plain_text):
|
||||
continue
|
||||
|
||||
|
||||
# 清理文本中的回复引用
|
||||
cleaned_text = clean_reply_text(msg.processed_plain_text)
|
||||
length = len(cleaned_text)
|
||||
|
||||
|
||||
if length == 0:
|
||||
distribution['0'] += 1
|
||||
distribution["0"] += 1
|
||||
elif length <= 5:
|
||||
distribution['1-5'] += 1
|
||||
distribution["1-5"] += 1
|
||||
elif length <= 10:
|
||||
distribution['6-10'] += 1
|
||||
distribution["6-10"] += 1
|
||||
elif length <= 20:
|
||||
distribution['11-20'] += 1
|
||||
distribution["11-20"] += 1
|
||||
elif length <= 30:
|
||||
distribution['21-30'] += 1
|
||||
distribution["21-30"] += 1
|
||||
elif length <= 50:
|
||||
distribution['31-50'] += 1
|
||||
distribution["31-50"] += 1
|
||||
elif length <= 70:
|
||||
distribution['51-70'] += 1
|
||||
distribution["51-70"] += 1
|
||||
elif length <= 100:
|
||||
distribution['71-100'] += 1
|
||||
distribution["71-100"] += 1
|
||||
elif length <= 150:
|
||||
distribution['101-150'] += 1
|
||||
distribution["101-150"] += 1
|
||||
elif length <= 200:
|
||||
distribution['151-200'] += 1
|
||||
distribution["151-200"] += 1
|
||||
elif length <= 300:
|
||||
distribution['201-300'] += 1
|
||||
distribution["201-300"] += 1
|
||||
elif length <= 500:
|
||||
distribution['301-500'] += 1
|
||||
distribution["301-500"] += 1
|
||||
elif length <= 1000:
|
||||
distribution['501-1000'] += 1
|
||||
distribution["501-1000"] += 1
|
||||
else:
|
||||
distribution['1000+'] += 1
|
||||
|
||||
distribution["1000+"] += 1
|
||||
|
||||
return distribution
|
||||
|
||||
|
||||
@@ -130,7 +131,7 @@ def get_text_length_stats(messages) -> Dict[str, float]:
|
||||
lengths = []
|
||||
null_count = 0
|
||||
excluded_count = 0 # 被排除的消息数量
|
||||
|
||||
|
||||
for msg in messages:
|
||||
if msg.processed_plain_text is None:
|
||||
null_count += 1
|
||||
@@ -141,29 +142,29 @@ def get_text_length_stats(messages) -> Dict[str, float]:
|
||||
# 清理文本中的回复引用
|
||||
cleaned_text = clean_reply_text(msg.processed_plain_text)
|
||||
lengths.append(len(cleaned_text))
|
||||
|
||||
|
||||
if not lengths:
|
||||
return {
|
||||
'count': 0,
|
||||
'null_count': null_count,
|
||||
'excluded_count': excluded_count,
|
||||
'min': 0,
|
||||
'max': 0,
|
||||
'avg': 0,
|
||||
'median': 0
|
||||
"count": 0,
|
||||
"null_count": null_count,
|
||||
"excluded_count": excluded_count,
|
||||
"min": 0,
|
||||
"max": 0,
|
||||
"avg": 0,
|
||||
"median": 0,
|
||||
}
|
||||
|
||||
|
||||
lengths.sort()
|
||||
count = len(lengths)
|
||||
|
||||
|
||||
return {
|
||||
'count': count,
|
||||
'null_count': null_count,
|
||||
'excluded_count': excluded_count,
|
||||
'min': min(lengths),
|
||||
'max': max(lengths),
|
||||
'avg': sum(lengths) / count,
|
||||
'median': lengths[count // 2] if count % 2 == 1 else (lengths[count // 2 - 1] + lengths[count // 2]) / 2
|
||||
"count": count,
|
||||
"null_count": null_count,
|
||||
"excluded_count": excluded_count,
|
||||
"min": min(lengths),
|
||||
"max": max(lengths),
|
||||
"avg": sum(lengths) / count,
|
||||
"median": lengths[count // 2] if count % 2 == 1 else (lengths[count // 2 - 1] + lengths[count // 2]) / 2,
|
||||
}
|
||||
|
||||
|
||||
@@ -174,21 +175,25 @@ def get_available_chats() -> List[Tuple[str, str, int]]:
|
||||
chat_counts = {}
|
||||
for msg in Messages.select(Messages.chat_id).distinct():
|
||||
chat_id = msg.chat_id
|
||||
count = Messages.select().where(
|
||||
(Messages.chat_id == chat_id) &
|
||||
(Messages.is_emoji != 1) &
|
||||
(Messages.is_picid != 1) &
|
||||
(Messages.is_command != 1)
|
||||
).count()
|
||||
count = (
|
||||
Messages.select()
|
||||
.where(
|
||||
(Messages.chat_id == chat_id)
|
||||
& (Messages.is_emoji != 1)
|
||||
& (Messages.is_picid != 1)
|
||||
& (Messages.is_command != 1)
|
||||
)
|
||||
.count()
|
||||
)
|
||||
if count > 0:
|
||||
chat_counts[chat_id] = count
|
||||
|
||||
|
||||
# 获取聊天名称
|
||||
result = []
|
||||
for chat_id, count in chat_counts.items():
|
||||
chat_name = get_chat_name(chat_id)
|
||||
result.append((chat_id, chat_name, count))
|
||||
|
||||
|
||||
# 按消息数量排序
|
||||
result.sort(key=lambda x: x[2], reverse=True)
|
||||
return result
|
||||
@@ -201,30 +206,30 @@ def get_time_range_input() -> Tuple[Optional[float], Optional[float]]:
|
||||
"""Get time range input from user"""
|
||||
print("\n时间范围选择:")
|
||||
print("1. 最近1天")
|
||||
print("2. 最近3天")
|
||||
print("2. 最近3天")
|
||||
print("3. 最近7天")
|
||||
print("4. 最近30天")
|
||||
print("5. 自定义时间范围")
|
||||
print("6. 不限制时间")
|
||||
|
||||
|
||||
choice = input("请选择时间范围 (1-6): ").strip()
|
||||
|
||||
|
||||
now = time.time()
|
||||
|
||||
|
||||
if choice == "1":
|
||||
return now - 24*3600, now
|
||||
return now - 24 * 3600, now
|
||||
elif choice == "2":
|
||||
return now - 3*24*3600, now
|
||||
return now - 3 * 24 * 3600, now
|
||||
elif choice == "3":
|
||||
return now - 7*24*3600, now
|
||||
return now - 7 * 24 * 3600, now
|
||||
elif choice == "4":
|
||||
return now - 30*24*3600, now
|
||||
return now - 30 * 24 * 3600, now
|
||||
elif choice == "5":
|
||||
print("请输入开始时间 (格式: YYYY-MM-DD HH:MM:SS):")
|
||||
start_str = input().strip()
|
||||
print("请输入结束时间 (格式: YYYY-MM-DD HH:MM:SS):")
|
||||
end_str = input().strip()
|
||||
|
||||
|
||||
try:
|
||||
start_time = datetime.strptime(start_str, "%Y-%m-%d %H:%M:%S").timestamp()
|
||||
end_time = datetime.strptime(end_str, "%Y-%m-%d %H:%M:%S").timestamp()
|
||||
@@ -239,13 +244,13 @@ def get_time_range_input() -> Tuple[Optional[float], Optional[float]]:
|
||||
def get_top_longest_messages(messages, top_n: int = 10) -> List[Tuple[str, int, str, str]]:
|
||||
"""Get top N longest messages"""
|
||||
message_lengths = []
|
||||
|
||||
|
||||
for msg in messages:
|
||||
if msg.processed_plain_text is not None:
|
||||
# 排除包含表情包或图片标记的消息
|
||||
if contains_emoji_or_image_tags(msg.processed_plain_text):
|
||||
continue
|
||||
|
||||
|
||||
# 清理文本中的回复引用
|
||||
cleaned_text = clean_reply_text(msg.processed_plain_text)
|
||||
length = len(cleaned_text)
|
||||
@@ -254,42 +259,40 @@ def get_top_longest_messages(messages, top_n: int = 10) -> List[Tuple[str, int,
|
||||
# 截取前100个字符作为预览
|
||||
preview = cleaned_text[:100] + "..." if len(cleaned_text) > 100 else cleaned_text
|
||||
message_lengths.append((chat_name, length, time_str, preview))
|
||||
|
||||
|
||||
# 按长度排序,取前N个
|
||||
message_lengths.sort(key=lambda x: x[1], reverse=True)
|
||||
return message_lengths[:top_n]
|
||||
|
||||
|
||||
def analyze_text_lengths(chat_id: Optional[str] = None, start_time: Optional[float] = None, end_time: Optional[float] = None) -> None:
|
||||
def analyze_text_lengths(
|
||||
chat_id: Optional[str] = None, start_time: Optional[float] = None, end_time: Optional[float] = None
|
||||
) -> None:
|
||||
"""Analyze processed_plain_text lengths with optional filters"""
|
||||
|
||||
|
||||
# 构建查询条件,排除特殊类型的消息
|
||||
query = Messages.select().where(
|
||||
(Messages.is_emoji != 1) &
|
||||
(Messages.is_picid != 1) &
|
||||
(Messages.is_command != 1)
|
||||
)
|
||||
|
||||
query = Messages.select().where((Messages.is_emoji != 1) & (Messages.is_picid != 1) & (Messages.is_command != 1))
|
||||
|
||||
if chat_id:
|
||||
query = query.where(Messages.chat_id == chat_id)
|
||||
|
||||
|
||||
if start_time:
|
||||
query = query.where(Messages.time >= start_time)
|
||||
|
||||
|
||||
if end_time:
|
||||
query = query.where(Messages.time <= end_time)
|
||||
|
||||
|
||||
messages = list(query)
|
||||
|
||||
|
||||
if not messages:
|
||||
print("没有找到符合条件的消息")
|
||||
return
|
||||
|
||||
|
||||
# 计算统计信息
|
||||
distribution = calculate_text_length_distribution(messages)
|
||||
stats = get_text_length_stats(messages)
|
||||
top_longest = get_top_longest_messages(messages, 10)
|
||||
|
||||
|
||||
# 显示结果
|
||||
print("\n=== Processed Plain Text 长度分析结果 ===")
|
||||
print("(已排除表情、图片ID、命令类型消息,已排除[表情包]和[图片]标记消息,已清理回复引用)")
|
||||
@@ -297,7 +300,7 @@ def analyze_text_lengths(chat_id: Optional[str] = None, start_time: Optional[flo
|
||||
print(f"聊天: {get_chat_name(chat_id)}")
|
||||
else:
|
||||
print("聊天: 全部聊天")
|
||||
|
||||
|
||||
if start_time and end_time:
|
||||
print(f"时间范围: {format_timestamp(start_time)} 到 {format_timestamp(end_time)}")
|
||||
elif start_time:
|
||||
@@ -306,26 +309,26 @@ def analyze_text_lengths(chat_id: Optional[str] = None, start_time: Optional[flo
|
||||
print(f"时间范围: {format_timestamp(end_time)} 之前")
|
||||
else:
|
||||
print("时间范围: 不限制")
|
||||
|
||||
|
||||
print("\n基本统计:")
|
||||
print(f"总消息数量: {len(messages)}")
|
||||
print(f"有文本消息数量: {stats['count']}")
|
||||
print(f"空文本消息数量: {stats['null_count']}")
|
||||
print(f"被排除的消息数量: {stats['excluded_count']}")
|
||||
if stats['count'] > 0:
|
||||
if stats["count"] > 0:
|
||||
print(f"最短长度: {stats['min']} 字符")
|
||||
print(f"最长长度: {stats['max']} 字符")
|
||||
print(f"平均长度: {stats['avg']:.2f} 字符")
|
||||
print(f"中位数长度: {stats['median']:.2f} 字符")
|
||||
|
||||
|
||||
print("\n文本长度分布:")
|
||||
total = stats['count']
|
||||
total = stats["count"]
|
||||
if total > 0:
|
||||
for range_name, count in distribution.items():
|
||||
if count > 0:
|
||||
percentage = count / total * 100
|
||||
print(f"{range_name} 字符: {count} ({percentage:.2f}%)")
|
||||
|
||||
|
||||
# 显示最长的消息
|
||||
if top_longest:
|
||||
print(f"\n最长的 {len(top_longest)} 条消息:")
|
||||
@@ -338,34 +341,34 @@ def analyze_text_lengths(chat_id: Optional[str] = None, start_time: Optional[flo
|
||||
|
||||
def interactive_menu() -> None:
|
||||
"""Interactive menu for text length analysis"""
|
||||
|
||||
|
||||
while True:
|
||||
print("\n" + "="*50)
|
||||
print("\n" + "=" * 50)
|
||||
print("Processed Plain Text 长度分析工具")
|
||||
print("="*50)
|
||||
print("=" * 50)
|
||||
print("1. 分析全部聊天")
|
||||
print("2. 选择特定聊天分析")
|
||||
print("q. 退出")
|
||||
|
||||
|
||||
choice = input("\n请选择分析模式 (1-2, q): ").strip()
|
||||
|
||||
if choice.lower() == 'q':
|
||||
|
||||
if choice.lower() == "q":
|
||||
print("再见!")
|
||||
break
|
||||
|
||||
|
||||
chat_id = None
|
||||
|
||||
|
||||
if choice == "2":
|
||||
# 显示可用的聊天列表
|
||||
chats = get_available_chats()
|
||||
if not chats:
|
||||
print("没有找到聊天数据")
|
||||
continue
|
||||
|
||||
|
||||
print(f"\n可用的聊天 (共{len(chats)}个):")
|
||||
for i, (_cid, name, count) in enumerate(chats, 1):
|
||||
print(f"{i}. {name} ({count}条消息)")
|
||||
|
||||
|
||||
try:
|
||||
chat_choice = int(input(f"\n请选择聊天 (1-{len(chats)}): ").strip())
|
||||
if 1 <= chat_choice <= len(chats):
|
||||
@@ -376,19 +379,19 @@ def interactive_menu() -> None:
|
||||
except ValueError:
|
||||
print("请输入有效数字")
|
||||
continue
|
||||
|
||||
|
||||
elif choice != "1":
|
||||
print("无效选择")
|
||||
continue
|
||||
|
||||
|
||||
# 获取时间范围
|
||||
start_time, end_time = get_time_range_input()
|
||||
|
||||
|
||||
# 执行分析
|
||||
analyze_text_lengths(chat_id, start_time, end_time)
|
||||
|
||||
|
||||
input("\n按回车键继续...")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
interactive_menu()
|
||||
interactive_menu()
|
||||
|
||||
Reference in New Issue
Block a user