From 89bee8db90a42ff5d145033e3faf6ffca6ccde56 Mon Sep 17 00:00:00 2001 From: tt-P607 <68868379+tt-P607@users.noreply.github.com> Date: Mon, 25 Aug 2025 01:21:50 +0800 Subject: [PATCH] =?UTF-8?q?feat(video):=20=E6=96=B0=E5=A2=9E=E6=8C=89?= =?UTF-8?q?=E6=97=B6=E9=97=B4=E9=97=B4=E9=9A=94=E6=8A=BD=E5=B8=A7=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 增加了按时间间隔提取视频帧的新模式,与原有的固定总帧数模式并存。用户现在可以通过配置文件选择最适合其需求的抽帧方式。 - 新增 `frame_extraction_mode` 配置项,可选值为 "fixed_number" (固定总帧数) 或 "time_interval" (按时间间隔)。 - 新增 `frame_interval_seconds` 配置项,用于指定时间间隔模式下的抽帧秒数。 - `max_frames` 配置项现在仅在 "fixed_number" 模式下生效。 - 更新了 `VideoAnalyzer` 类以支持两种抽帧逻辑,并从配置中读取新参数。 - 相应地更新了配置文件模板 `bot_config_template.toml`。 --- src/chat/utils/utils_video.py | 71 ++++++++++++++++++----- src/plugins/built_in/tts_plugin/plugin.py | 2 + template/bot_config_template.toml | 4 +- 3 files changed, 60 insertions(+), 17 deletions(-) diff --git a/src/chat/utils/utils_video.py b/src/chat/utils/utils_video.py index f68118580..d5333ee86 100644 --- a/src/chat/utils/utils_video.py +++ b/src/chat/utils/utils_video.py @@ -61,6 +61,8 @@ class VideoAnalyzer: self.max_image_size = config.max_image_size self.enable_frame_timing = config.enable_frame_timing self.batch_analysis_prompt = config.batch_analysis_prompt + self.frame_extraction_mode = config.frame_extraction_mode + self.frame_interval_seconds = config.frame_interval_seconds # 将配置文件中的模式映射到内部使用的模式名称 config_mode = config.analysis_mode @@ -92,6 +94,8 @@ class VideoAnalyzer: self.batch_size = 3 # 批处理时每批处理的帧数 self.timeout = 60.0 # 分析超时时间(秒) self.enable_frame_timing = True + self.frame_extraction_mode = "fixed_number" + self.frame_interval_seconds = 2.0 self.batch_analysis_prompt = """请分析这个视频的内容。这些图片是从视频中按时间顺序提取的关键帧。 请提供详细的分析,包括: @@ -191,24 +195,59 @@ class VideoAnalyzer: logger.info(f"视频信息: {total_frames}帧, {fps:.2f}FPS, {duration:.2f}秒") - # 动态计算帧间隔 - if duration > 0: - frame_interval = max(1, int(duration / 
self.max_frames * fps)) - else: - frame_interval = 30 # 默认间隔 - frame_count = 0 extracted_count = 0 - while cap.isOpened() and extracted_count < self.max_frames: - ret, frame = cap.read() - if not ret: - break + if self.frame_extraction_mode == "time_interval": + # 新模式:按时间间隔抽帧 + time_interval = self.frame_interval_seconds + next_frame_time = 0.0 + + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break - if frame_count % frame_interval == 0: - # 转换为PIL图像并压缩 - frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - pil_image = Image.fromarray(frame_rgb) + current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0 + + if current_time >= next_frame_time: + # 转换为PIL图像并压缩 + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + pil_image = Image.fromarray(frame_rgb) + + # 调整图像大小 + if max(pil_image.size) > self.max_image_size: + ratio = self.max_image_size / max(pil_image.size) + new_size = tuple(int(dim * ratio) for dim in pil_image.size) + pil_image = pil_image.resize(new_size, Image.Resampling.LANCZOS) + + # 转换为base64 + buffer = io.BytesIO() + pil_image.save(buffer, format='JPEG', quality=self.frame_quality) + frame_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8') + + frames.append((frame_base64, current_time)) + extracted_count += 1 + + logger.debug(f"提取第{extracted_count}帧 (时间: {current_time:.2f}s)") + + next_frame_time += time_interval + else: + # 旧模式:固定总帧数 + if duration > 0: + frame_interval = max(1, int(total_frames / self.max_frames)) + else: + frame_interval = 1 # 如果无法获取时长,则逐帧提取直到达到max_frames + + while cap.isOpened() and extracted_count < self.max_frames: + ret, frame = cap.read() + if not ret: + break + + if frame_count % frame_interval == 0: + # 转换为PIL图像并压缩 + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + pil_image = Image.fromarray(frame_rgb) # 调整图像大小 if max(pil_image.size) > self.max_image_size: @@ -227,8 +266,8 @@ class VideoAnalyzer: extracted_count += 1 logger.debug(f"提取第{extracted_count}帧 (时间: {timestamp:.2f}s)") - - frame_count 
+= 1 + + frame_count += 1 cap.release() logger.info(f"✅ 成功提取{len(frames)}帧") diff --git a/src/plugins/built_in/tts_plugin/plugin.py b/src/plugins/built_in/tts_plugin/plugin.py index 4e4d3648b..30748a9ff 100644 --- a/src/plugins/built_in/tts_plugin/plugin.py +++ b/src/plugins/built_in/tts_plugin/plugin.py @@ -34,6 +34,8 @@ class TTSAction(BaseAction): # 动作使用场景 action_require = [ "当需要发送语音信息时使用", + "当用户要求你说话时使用", + "当用户要求听你声音时使用", "当用户明确要求使用语音功能时使用", "当表达内容更适合用语音而不是文字传达时使用", "当用户想听到语音回答而非阅读文本时使用", diff --git a/template/bot_config_template.toml b/template/bot_config_template.toml index 2b0e165bc..09f755c06 100644 --- a/template/bot_config_template.toml +++ b/template/bot_config_template.toml @@ -382,7 +382,9 @@ enable_friend_chat = false # 是否启用好友聊天 [video_analysis] # 视频分析配置 enable = true # 是否启用视频分析功能 analysis_mode = "batch_frames" # 分析模式:"frame_by_frame"(逐帧分析,非常慢 "建议frames大于8时不要使用这个" ...但是详细)、"batch_frames"(批量分析,快但可能略简单 -其实效果也差不多)或 "auto"(自动选择) -max_frames = 16 # 最大分析帧数 +frame_extraction_mode = "fixed_number" # 抽帧模式: "fixed_number" (固定总帧数) 或 "time_interval" (按时间间隔) +frame_interval_seconds = 2.0 # 按时间间隔抽帧的秒数(仅在 frame_extraction_mode = "time_interval" 时生效) +max_frames = 16 # 最大分析帧数(仅在 frame_extraction_mode = "fixed_number" 时生效) frame_quality = 80 # 帧图像JPEG质量 (1-100) max_image_size = 800 # 单帧最大图像尺寸(像素) enable_frame_timing = true # 是否在分析中包含帧的时间信息