Expand dangerous keyword list in message shielding
Extend the list of dangerous keywords in the `_partially_shield_content` method to cover more attack vectors, including system commands, privilege escalation, information leakage, and social engineering. This makes the message shielding mechanism more robust against prompt injection and related attacks.
This commit is contained in:
@@ -103,16 +103,138 @@ class MessageShield:
|
||||
|
||||
def _partially_shield_content(self, message: str) -> str:
|
||||
"""部分遮蔽消息内容"""
|
||||
# 简单的遮蔽策略:替换关键词
|
||||
# 遮蔽策略:替换关键词
|
||||
dangerous_keywords = [
|
||||
# 系统指令相关
|
||||
('sudo', '[管理指令]'),
|
||||
('root', '[权限词]'),
|
||||
('admin', '[管理员]'),
|
||||
('administrator', '[管理员]'),
|
||||
('system', '[系统]'),
|
||||
('/system', '[系统指令]'),
|
||||
('exec', '[执行指令]'),
|
||||
('command', '[命令]'),
|
||||
('bash', '[终端]'),
|
||||
('shell', '[终端]'),
|
||||
|
||||
# 角色扮演攻击
|
||||
('开发者模式', '[特殊模式]'),
|
||||
('忽略', '[指令词]'),
|
||||
('扮演', '[角色词]'),
|
||||
('roleplay', '[角色扮演]'),
|
||||
('你现在是', '[身份词]'),
|
||||
('你必须扮演', '[角色指令]'),
|
||||
('assume the role', '[角色假设]'),
|
||||
('pretend to be', '[伪装身份]'),
|
||||
('act as', '[扮演]'),
|
||||
('你的新身份', '[身份变更]'),
|
||||
('现在你是', '[身份转换]'),
|
||||
|
||||
# 指令忽略攻击
|
||||
('忽略', '[指令词]'),
|
||||
('forget', '[遗忘指令]'),
|
||||
('ignore', '[忽略指令]'),
|
||||
('忽略之前', '[忽略历史]'),
|
||||
('忽略所有', '[全部忽略]'),
|
||||
('忽略指令', '[指令忽略]'),
|
||||
('ignore previous', '[忽略先前]'),
|
||||
('forget everything', '[遗忘全部]'),
|
||||
('disregard', '[无视指令]'),
|
||||
('override', '[覆盖指令]'),
|
||||
|
||||
# 限制绕过
|
||||
('法律', '[限制词]'),
|
||||
('伦理', '[限制词]')
|
||||
('伦理', '[限制词]'),
|
||||
('道德', '[道德词]'),
|
||||
('规则', '[规则词]'),
|
||||
('限制', '[限制词]'),
|
||||
('安全', '[安全词]'),
|
||||
('禁止', '[禁止词]'),
|
||||
('不允许', '[不允许]'),
|
||||
('违法', '[违法词]'),
|
||||
('illegal', '[非法]'),
|
||||
('unethical', '[不道德]'),
|
||||
('harmful', '[有害]'),
|
||||
('dangerous', '[危险]'),
|
||||
('unsafe', '[不安全]'),
|
||||
|
||||
# 权限提升
|
||||
('最高权限', '[权限提升]'),
|
||||
('管理员权限', '[管理权限]'),
|
||||
('超级用户', '[超级权限]'),
|
||||
('特权模式', '[特权]'),
|
||||
('god mode', '[上帝模式]'),
|
||||
('debug mode', '[调试模式]'),
|
||||
('developer access', '[开发者权限]'),
|
||||
('privileged', '[特权]'),
|
||||
('elevated', '[提升权限]'),
|
||||
('unrestricted', '[无限制]'),
|
||||
|
||||
# 信息泄露攻击
|
||||
('泄露', '[泄露词]'),
|
||||
('机密', '[机密词]'),
|
||||
('秘密', '[秘密词]'),
|
||||
('隐私', '[隐私词]'),
|
||||
('内部', '[内部词]'),
|
||||
('配置', '[配置词]'),
|
||||
('密码', '[密码词]'),
|
||||
('token', '[令牌]'),
|
||||
('key', '[密钥]'),
|
||||
('secret', '[秘密]'),
|
||||
('confidential', '[机密]'),
|
||||
('private', '[私有]'),
|
||||
('internal', '[内部]'),
|
||||
('classified', '[机密级]'),
|
||||
('sensitive', '[敏感]'),
|
||||
|
||||
# 系统信息获取
|
||||
('打印', '[输出指令]'),
|
||||
('显示', '[显示指令]'),
|
||||
('输出', '[输出指令]'),
|
||||
('告诉我', '[询问指令]'),
|
||||
('reveal', '[揭示]'),
|
||||
('show me', '[显示给我]'),
|
||||
('print', '[打印]'),
|
||||
('output', '[输出]'),
|
||||
('display', '[显示]'),
|
||||
('dump', '[转储]'),
|
||||
('extract', '[提取]'),
|
||||
('获取', '[获取指令]'),
|
||||
|
||||
# 特殊模式激活
|
||||
('维护模式', '[维护模式]'),
|
||||
('测试模式', '[测试模式]'),
|
||||
('诊断模式', '[诊断模式]'),
|
||||
('安全模式', '[安全模式]'),
|
||||
('紧急模式', '[紧急模式]'),
|
||||
('maintenance', '[维护]'),
|
||||
('diagnostic', '[诊断]'),
|
||||
('emergency', '[紧急]'),
|
||||
('recovery', '[恢复]'),
|
||||
('service', '[服务]'),
|
||||
|
||||
# 恶意指令
|
||||
('执行', '[执行词]'),
|
||||
('运行', '[运行词]'),
|
||||
('启动', '[启动词]'),
|
||||
('activate', '[激活]'),
|
||||
('execute', '[执行]'),
|
||||
('run', '[运行]'),
|
||||
('launch', '[启动]'),
|
||||
('trigger', '[触发]'),
|
||||
('invoke', '[调用]'),
|
||||
('call', '[调用]'),
|
||||
|
||||
# 社会工程
|
||||
('紧急', '[紧急词]'),
|
||||
('急需', '[急需词]'),
|
||||
('立即', '[立即词]'),
|
||||
('马上', '[马上词]'),
|
||||
('urgent', '[紧急]'),
|
||||
('immediate', '[立即]'),
|
||||
('emergency', '[紧急状态]'),
|
||||
('critical', '[关键]'),
|
||||
('important', '[重要]'),
|
||||
('必须', '[必须词]')
|
||||
]
|
||||
|
||||
shielded_message = message
|
||||
|
||||
Reference in New Issue
Block a user