Files
Mofox-Core/src/chat/semantic_interest/model_lr.py
Windpicker-owo e8bffe4a87 feat: 实现TF-IDF特征提取器和逻辑回归模型用于语义兴趣评分
- 新增了TfidfFeatureExtractor,用于字符级n-gram的TF-IDF向量化,适用于中文及多语言场景。
- 基于逻辑回归开发了语义兴趣模型,用于多类别兴趣标签(-1、0、1)的预测。
- 创建了在线推理的运行时评分器,实现消息兴趣评分的快速评估。
- 建立了模型训练、评估和数据集准备的全流程训练体系。
- 集成模型管理,支持热加载与个性化模型选择。
2025-12-11 21:28:27 +08:00

266 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Logistic Regression 模型训练与推理
使用多分类 Logistic Regression 预测消息的兴趣度标签 (-1, 0, 1)
"""
import time
from pathlib import Path
from typing import Any
import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from src.common.logger import get_logger
from src.chat.semantic_interest.features_tfidf import TfidfFeatureExtractor
logger = get_logger("semantic_interest.model")
class SemanticInterestModel:
"""语义兴趣度模型
使用 Logistic Regression 进行多分类(-1: 不感兴趣, 0: 中立, 1: 感兴趣)
"""
def __init__(
self,
class_weight: str | dict | None = "balanced",
max_iter: int = 1000,
solver: str = "lbfgs", # type: ignore
n_jobs: int = -1,
):
"""初始化模型
Args:
class_weight: 类别权重配置
- "balanced": 自动平衡类别权重
- dict: 自定义权重,如 {-1: 0.8, 0: 0.6, 1: 1.6}
- None: 不使用权重
max_iter: 最大迭代次数
solver: 求解器 ('lbfgs', 'saga', 'liblinear' 等)
n_jobs: 并行任务数,-1 表示使用所有 CPU 核心
"""
self.clf = LogisticRegression(
multi_class="multinomial",
solver=solver,
max_iter=max_iter,
class_weight=class_weight,
n_jobs=n_jobs,
random_state=42,
)
self.is_fitted = False
self.label_mapping = {-1: 0, 0: 1, 1: 2} # 内部类别映射
self.training_metrics = {}
logger.info(
f"Logistic Regression 模型初始化: class_weight={class_weight}, "
f"max_iter={max_iter}, solver={solver}"
)
def train(
self,
X_train,
y_train,
X_val=None,
y_val=None,
verbose: bool = True,
) -> dict[str, Any]:
"""训练模型
Args:
X_train: 训练集特征矩阵
y_train: 训练集标签(-1, 0, 1
X_val: 验证集特征矩阵(可选)
y_val: 验证集标签(可选)
verbose: 是否输出详细日志
Returns:
训练指标字典
"""
start_time = time.time()
logger.info(f"开始训练模型,训练样本数: {len(y_train)}")
# 训练模型
self.clf.fit(X_train, y_train)
self.is_fitted = True
training_time = time.time() - start_time
logger.info(f"模型训练完成,耗时: {training_time:.2f}")
# 计算训练集指标
y_train_pred = self.clf.predict(X_train)
train_accuracy = (y_train_pred == y_train).mean()
metrics = {
"training_time": training_time,
"train_accuracy": train_accuracy,
"train_samples": len(y_train),
}
if verbose:
logger.info(f"训练集准确率: {train_accuracy:.4f}")
logger.info(f"类别分布: {dict(zip(*np.unique(y_train, return_counts=True)))}")
# 如果提供了验证集,计算验证指标
if X_val is not None and y_val is not None:
val_metrics = self.evaluate(X_val, y_val, verbose=verbose)
metrics.update(val_metrics)
self.training_metrics = metrics
return metrics
def evaluate(
self,
X_test,
y_test,
verbose: bool = True,
) -> dict[str, Any]:
"""评估模型
Args:
X_test: 测试集特征矩阵
y_test: 测试集标签
verbose: 是否输出详细日志
Returns:
评估指标字典
"""
if not self.is_fitted:
raise ValueError("模型尚未训练")
y_pred = self.clf.predict(X_test)
accuracy = (y_pred == y_test).mean()
metrics = {
"test_accuracy": accuracy,
"test_samples": len(y_test),
}
if verbose:
logger.info(f"测试集准确率: {accuracy:.4f}")
logger.info("\n分类报告:")
report = classification_report(
y_test,
y_pred,
labels=[-1, 0, 1],
target_names=["不感兴趣(-1)", "中立(0)", "感兴趣(1)"],
zero_division=0,
)
logger.info(f"\n{report}")
logger.info("\n混淆矩阵:")
cm = confusion_matrix(y_test, y_pred, labels=[-1, 0, 1])
logger.info(f"\n{cm}")
return metrics
def predict_proba(self, X) -> np.ndarray:
"""预测概率分布
Args:
X: 特征矩阵
Returns:
概率矩阵,形状为 (n_samples, 3),对应 [-1, 0, 1] 的概率
"""
if not self.is_fitted:
raise ValueError("模型尚未训练")
proba = self.clf.predict_proba(X)
# 确保类别顺序为 [-1, 0, 1]
classes = self.clf.classes_
if not np.array_equal(classes, [-1, 0, 1]):
# 需要重新排序
sorted_proba = np.zeros_like(proba)
for i, cls in enumerate([-1, 0, 1]):
idx = np.where(classes == cls)[0]
if len(idx) > 0:
sorted_proba[:, i] = proba[:, idx[0]]
return sorted_proba
return proba
def predict(self, X) -> np.ndarray:
"""预测类别
Args:
X: 特征矩阵
Returns:
预测标签数组
"""
if not self.is_fitted:
raise ValueError("模型尚未训练")
return self.clf.predict(X)
def get_config(self) -> dict:
"""获取模型配置
Returns:
配置字典
"""
params = self.clf.get_params()
return {
"multi_class": params["multi_class"],
"solver": params["solver"],
"max_iter": params["max_iter"],
"class_weight": params["class_weight"],
"is_fitted": self.is_fitted,
"classes": self.clf.classes_.tolist() if self.is_fitted else None,
}
def train_semantic_model(
    texts: list[str],
    labels: list[int],
    test_size: float = 0.1,
    random_state: int = 42,
    tfidf_config: dict | None = None,
    model_config: dict | None = None,
) -> tuple[TfidfFeatureExtractor, SemanticInterestModel, dict]:
    """Train the complete semantic interest pipeline.

    Splits the data, fits a TF-IDF extractor on the training texts, then
    trains a ``SemanticInterestModel`` and evaluates it on the held-out part.

    Args:
        texts: Message texts.
        labels: Labels matching ``texts`` (-1, 0, 1).
        test_size: Fraction of the data held out for validation.
        random_state: Random seed used for the split.
        tfidf_config: Keyword arguments for ``TfidfFeatureExtractor``.
        model_config: Keyword arguments for ``SemanticInterestModel``.

    Returns:
        Tuple of (feature extractor, model, training metrics).
    """
    logger.info(f"开始训练语义兴趣度模型,总样本数: {len(texts)}")

    # Hold-out split, stratified on the labels.
    split = train_test_split(
        texts,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )
    train_texts, val_texts, train_labels, val_labels = split
    logger.info(f"训练集: {len(train_texts)}, 验证集: {len(val_texts)}")

    # The extractor is fitted on the training texts only; the validation
    # texts are merely transformed with the fitted extractor.
    extractor_kwargs = tfidf_config if tfidf_config else {}
    feature_extractor = TfidfFeatureExtractor(**extractor_kwargs)
    train_matrix = feature_extractor.fit_transform(train_texts)
    val_matrix = feature_extractor.transform(val_texts)

    # Build the classifier and train it, passing the validation set along.
    model_kwargs = model_config if model_config else {}
    model = SemanticInterestModel(**model_kwargs)
    metrics = model.train(train_matrix, train_labels, val_matrix, val_labels)

    logger.info("语义兴趣度模型训练完成")
    return feature_extractor, model, metrics