From 8f4489a14566c9b1e5bccda6d0beca979d3f9739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A2=A8=E6=A2=93=E6=9F=92?= <1787882683@qq.com> Date: Wed, 21 May 2025 09:16:01 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E5=B0=9D=E8=AF=95=E4=BD=BF=E7=94=A8jso?= =?UTF-8?q?n=5Frepair=E5=BA=93=E4=BB=A3=E6=9B=BFHRAG=E7=9A=84JSON=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/chat/knowledge/src/utils/json_fix.py | 76 ++++-------------------- 1 file changed, 12 insertions(+), 64 deletions(-) diff --git a/src/chat/knowledge/src/utils/json_fix.py b/src/chat/knowledge/src/utils/json_fix.py index a83eb4914..46da2d730 100644 --- a/src/chat/knowledge/src/utils/json_fix.py +++ b/src/chat/knowledge/src/utils/json_fix.py @@ -1,76 +1,24 @@ import json +from json_repair import repair_json +# 以下代码用于修复损坏的 JSON 字符串。 -def _find_unclosed(json_str): - """ - Identifies the unclosed braces and brackets in the JSON string. - - Args: - json_str (str): The JSON string to analyze. - - Returns: - list: A list of unclosed elements in the order they were opened. - """ - unclosed = [] - inside_string = False - escape_next = False - - for char in json_str: - if inside_string: - if escape_next: - escape_next = False - elif char == "\\": - escape_next = True - elif char == '"': - inside_string = False - else: - if char == '"': - inside_string = True - elif char in "{[": - unclosed.append(char) - elif char in "}]": - if unclosed and ((char == "}" and unclosed[-1] == "{") or (char == "]" and unclosed[-1] == "[")): - unclosed.pop() - - return unclosed - - -# The following code is used to fix a broken JSON string. -# From HippoRAG2 (GitHub: OSU-NLP-Group/HippoRAG) def fix_broken_generated_json(json_str: str) -> str: """ - Fixes a malformed JSON string by: - - Removing the last comma and any trailing content. - - Iterating over the JSON string once to determine and fix unclosed braces or brackets. - - Ensuring braces and brackets inside string literals are not considered. + 使用 json-repair 库修复格式错误的 JSON 字符串。 - If the original json_str string can be successfully loaded by json.loads(), will directly return it without any modification. + 如果原始 json_str 字符串可以被 json.loads() 成功加载,则直接返回而不进行任何修改。 - Args: - json_str (str): The malformed JSON string to be fixed. + 参数: + json_str (str): 需要修复的格式错误的 JSON 字符串。 - Returns: - str: The corrected JSON string. + 返回: + str: 修复后的 JSON 字符串。 """ - try: - # Try to load the JSON to see if it is valid + # 尝试加载 JSON 以查看其是否有效 json.loads(json_str) - return json_str # Return as-is if valid + return json_str # 如果有效则按原样返回 except json.JSONDecodeError: - pass - - # Step 1: Remove trailing content after the last comma. - last_comma_index = json_str.rfind(",") - if last_comma_index != -1: - json_str = json_str[:last_comma_index] - - # Step 2: Identify unclosed braces and brackets. - unclosed_elements = _find_unclosed(json_str) - - # Step 3: Append the necessary closing elements in reverse order of opening. - closing_map = {"{": "}", "[": "]"} - for open_char in reversed(unclosed_elements): - json_str += closing_map[open_char] - - return json_str + # 如果无效,则尝试修复它 + return repair_json(json_str)