fix：修改log,说明更清楚

2025-06-07 22:09:42 +08:00
parent 1a713ed0d9
commit c6ffad2a84
5 changed files with 223 additions and 10 deletions
--- a/tests/test_sentence_similarity.py
+++ b/tests/test_sentence_similarity.py
@@ -0,0 +1,156 @@
+import time
+import unittest
+import jieba
+from difflib import SequenceMatcher
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+def tfidf_similarity(s1, s2):
+    """Return the TF-IDF cosine similarity of two sentences.
+
+    Both sentences are segmented with jieba, re-joined with spaces, and
+    fitted together as a two-document corpus. The result is the cosine
+    similarity between the two resulting TF-IDF rows.
+
+    Args:
+        s1: First sentence.
+        s2: Second sentence.
+
+    Returns:
+        The similarity as a float, or 0.0 when the vectorizer cannot build
+        a vocabulary (for example when both sentences are empty).
+    """
+    # Segment with jieba and join with spaces so the vectorizer can split
+    # the text back into words.
+    corpus = [" ".join(jieba.cut(sentence)) for sentence in (s1, s2)]
+
+    # The default token_pattern keeps only tokens of two or more characters,
+    # which silently discards the single-character words jieba produces.
+    # Accept one-character tokens too so they contribute to the score.
+    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
+    try:
+        tfidf_matrix = vectorizer.fit_transform(corpus)
+    except ValueError:
+        # Raised when no token survives tokenization (empty vocabulary).
+        return 0.0
+
+    # Entry [0, 1] of the pairwise matrix is the s1-versus-s2 similarity.
+    # Convert to a plain float to match the fallback return above.
+    return float(cosine_similarity(tfidf_matrix)[0, 1])
+
+def sequence_similarity(s1, s2):
+    """Return the difflib ``SequenceMatcher`` ratio of two sentences.
+
+    Args:
+        s1: First sentence.
+        s2: Second sentence.
+
+    Returns:
+        The value of ``SequenceMatcher.ratio()`` for the pair, computed
+        without a junk filter.
+    """
+    matcher = SequenceMatcher(isjunk=None, a=s1, b=s2)
+    return matcher.ratio()
+
+class TestSentenceSimilarity(unittest.TestCase):
+    """Compare TF-IDF and SequenceMatcher similarity on sample sentences.
+
+    Scores and timings are printed for manual comparison. The assertions
+    only check basic sanity properties, because no similarity threshold is
+    defined for the ``expected_similar`` flags.
+    """
+
+    # Floating-point rounding can push a cosine similarity marginally
+    # outside the [0, 1] range.
+    _TOLERANCE = 1e-9
+
+    def _assert_valid_score(self, score):
+        """Assert that ``score`` lies in [0, 1] within ``_TOLERANCE``."""
+        self.assertGreaterEqual(score, -self._TOLERANCE)
+        self.assertLessEqual(score, 1.0 + self._TOLERANCE)
+
+    @staticmethod
+    def _print_matrix(matrix):
+        """Print a similarity matrix, each value followed by a tab."""
+        for row in matrix:
+            for similarity in row:
+                print(f"{similarity:.4f}", end="\t")
+            print()
+
+    def test_similarity_comparison(self):
+        """Compare the results of the different similarity methods."""
+        test_cases = [
+            {
+                "sentence1": "今天天气怎么样",
+                "sentence2": "今天气候如何",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "今天天气怎么样",
+                "sentence2": "我今天要去吃麦当劳",
+                "expected_similar": False
+            },
+            {
+                "sentence1": "我今天要去吃麦当劳",
+                "sentence2": "肯德基和麦当劳哪家好吃",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "Vindemiatrix提到昨天三个无赖杀穿交界地",
+                "sentence2": "Vindemiatrix昨天用三个无赖角色杀穿了游戏中的交界地",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "tc_魔法士解释了之前templateinfo的with用法和现在的单独逻辑发送的区别",
+                "sentence2": "tc_魔法士解释了templateinfo的用法，包括它是一个字典，key是prompt的名字，value是prompt的内容，格式是只支持大括号的fstring",
+                "expected_similar": False
+            },
+            {
+                "sentence1": "YXH_XianYu分享了一张舰娘街机游戏的图片，并提到'玩舰娘街机的董不懂'",
+                "sentence2": "YXH_XianYu对街机游戏表现出兴趣，并分享了玩舰娘街机的经历",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "YXH_XianYu在考虑入坑明日方舟，犹豫是否要从零开荒或使用初始号",
+                "sentence2": "YXH_XianYu考虑入坑明日方舟，倾向于从零开荒或初始号开荒",
+                "expected_similar": True
+            },
+            {
+                "sentence1": "YXH_XianYu提到秋叶原好多人在玩maimai",
+                "sentence2": "YXH_XianYu对学园偶像的付费石头机制表示惊讶",
+                "expected_similar": False
+            }
+        ]
+
+        print("\n相似度计算方法比较:")
+        for i, case in enumerate(test_cases, 1):
+            # The expected flag is attached to the sub-test so it shows up
+            # in failure reports; it is not asserted against a threshold.
+            with self.subTest(case=i, expected_similar=case["expected_similar"]):
+                print(f"\n测试用例 {i}:")
+                print(f"句子1: {case['sentence1']}")
+                print(f"句子2: {case['sentence2']}")
+
+                # TF-IDF similarity; perf_counter is the clock intended for
+                # measuring short intervals.
+                start_time = time.perf_counter()
+                tfidf_sim = tfidf_similarity(case['sentence1'], case['sentence2'])
+                tfidf_time = time.perf_counter() - start_time
+
+                # SequenceMatcher similarity.
+                start_time = time.perf_counter()
+                seq_sim = sequence_similarity(case['sentence1'], case['sentence2'])
+                seq_time = time.perf_counter() - start_time
+
+                print(f"TF-IDF相似度: {tfidf_sim:.4f} (耗时: {tfidf_time:.4f}秒)")
+                print(f"SequenceMatcher相似度: {seq_sim:.4f} (耗时: {seq_time:.4f}秒)")
+
+                self._assert_valid_score(tfidf_sim)
+                self._assert_valid_score(seq_sim)
+
+    def test_batch_processing(self):
+        """Measure all-pairs similarity over a small batch of sentences."""
+        sentences = [
+            "人工智能正在改变世界",
+            "AI技术发展迅速",
+            "机器学习是人工智能的一个分支",
+            "深度学习在图像识别领域取得了突破",
+            "自然语言处理技术越来越成熟"
+        ]
+
+        print("\n批量处理测试:")
+
+        # TF-IDF: full pairwise matrix, row i / column j = sentence i vs j.
+        start_time = time.perf_counter()
+        tfidf_matrix = [
+            [tfidf_similarity(left, right) for right in sentences]
+            for left in sentences
+        ]
+        tfidf_time = time.perf_counter() - start_time
+
+        # SequenceMatcher: same pairwise layout.
+        start_time = time.perf_counter()
+        seq_matrix = [
+            [sequence_similarity(left, right) for right in sentences]
+            for left in sentences
+        ]
+        seq_time = time.perf_counter() - start_time
+
+        print(f"TF-IDF批量处理 {len(sentences)} 个句子耗时: {tfidf_time:.4f}秒")
+        print(f"SequenceMatcher批量处理 {len(sentences)} 个句子耗时: {seq_time:.4f}秒")
+
+        print("\nTF-IDF相似度矩阵:")
+        self._print_matrix(tfidf_matrix)
+
+        print("\nSequenceMatcher相似度矩阵:")
+        self._print_matrix(seq_matrix)
+
+        for matrix in (tfidf_matrix, seq_matrix):
+            for index, row in enumerate(matrix):
+                for score in row:
+                    self._assert_valid_score(score)
+                # A sentence compared with itself must be fully similar.
+                self.assertAlmostEqual(row[index], 1.0)
+
+# Run the test suite with verbose output when executed as a script.
+if __name__ == "__main__":
+    unittest.main(verbosity=2)