SuperCompress(arjunkshah/supercompress)
一、主要功能
SuperCompress 是面向 LLM 的轻量级提示词 / 上下文压缩工具,核心目标:在几乎不丢失语义的前提下,大幅减少输入给大模型的 token 数量。
- 对 prompt、对话历史、RAG 检索结果、工具返回内容(JSON / 日志)做智能精简
- 压缩率 60%–95%(10k token → 几百~几千 token)
- 语义损失极低,LLM 输出质量几乎不变
- 显著降低 OpenAI / Anthropic 等 LLM 的调用成本、延迟、上下文溢出风险
- 轻量、可本地部署、CPU 即可运行(无需 GPU)
二、实现原理(核心)
SuperCompress 通过轻量小模型 + 规则引擎实现语义感知压缩。
小模型的训练方法请参考官方文档:
https://arjunkshah-supercompress-55.mintlify.app/development/training
三、效果验证
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Compare a local LLM's answers on an original vs. a compressed context.

The script compresses ``long_context`` with ``supercompress.compress_context``,
asks the same question against both the original and the compressed text via
an Ollama-served model, and prints token savings, key-point recall and two
lexical similarity scores between the two answers.
"""

import re
import sys

# ======================= Configuration =======================
OLLAMA_HOST = "http://localhost:11434"
MODEL_NAME = "qwen3.5:4b"
BUDGET_RATIO = 0.8
NUM_PREDICT = 2048
RETRY_LIMIT = 2

# ======================= Test data (same as before) =======================
long_context = """..."""  # paste your long text here
query = "What do fetchone, fetchall, and fetchmany return when no rows are found?"
# Each key point is [method_name, [acceptable phrasings of the expected value]].
key_points = [
    {"name": "fetchone returns None", "keywords": ["fetchone", ["none", "null"]]},
    {
        "name": "fetchall returns empty list",
        "keywords": ["fetchall", ["empty list", "[]", "empty array"]],
    },
    {
        "name": "fetchmany returns empty list",
        "keywords": ["fetchmany", ["empty list", "[]", "empty array"]],
    },
]


# ======================= Helpers =======================
def get_ollama_client():
    """Return an ``ollama.Client`` bound to ``OLLAMA_HOST``."""
    # Imported lazily (like sklearn below) so the pure text metrics in this
    # file stay importable when the Ollama client package is not installed.
    import ollama

    return ollama.Client(host=OLLAMA_HOST)


def is_model_available(model_name=MODEL_NAME):
    """Return True if ``client.show(model_name)`` succeeds, else False.

    Any exception (including a missing client package) is reported on stdout
    and treated as "not available".
    """
    try:
        get_ollama_client().show(model_name)
        return True
    except Exception as e:
        print(f"❌ 模型检查失败:{e}")
        return False


def clean_context(text):
    """Strip every line and collapse runs of blank lines into a single one.

    Whitespace-only lines count as blank. No other characters are removed.
    """
    cleaned = []
    prev_empty = False
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            cleaned.append(stripped)
            prev_empty = False
        elif not prev_empty:
            cleaned.append("")
            prev_empty = True
    return "\n".join(cleaned)


def ask_qwen(context, question, model=MODEL_NAME, retry=RETRY_LIMIT):
    """Ask ``question`` against ``context`` and return the model's answer.

    The context is passed through ``clean_context`` first. An answer shorter
    than three characters, or an exception from the client, triggers another
    attempt, up to ``retry`` extra attempts.

    Returns:
        The stripped answer text, ``"[空回答]"`` if the final answer is empty,
        or a ``"[调用失败:...]"`` marker if the last attempt raised.
    """
    context = clean_context(context)
    # Sections are separated by blank lines so the reference text cannot run
    # straight into the "Question:" label.
    prompt = (
        "Answer the question based ONLY on the reference below. "
        'If the reference does not contain the answer, say "Not provided".\n\n'
        f"Reference:\n{context}\n\n"
        f"Question: {question}\n\n"
        "Answer:"
    )
    client = get_ollama_client()
    for attempt in range(retry + 1):
        try:
            # Sampling options are fixed; they do not change between attempts.
            # NOTE(review): num_ctx is 4096 while the original context may be
            # much longer -- presumably the server truncates it; confirm.
            resp = client.chat(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                options={
                    "temperature": 0.7,
                    "top_p": 0.9,
                    "repeat_penalty": 1.1,
                    "num_ctx": 4096,
                    "num_predict": NUM_PREDICT,
                },
            )
            answer = resp["message"]["content"].strip()
            # Print the raw answer (for debugging).
            print(f" [调试] 原始回答内容:'{answer}'")
            if len(answer) < 3 and attempt < retry:
                print(f" [重试{attempt+1}] 回答过短,重试...")
                continue
            return answer if answer else "[空回答]"
        except Exception as e:
            print(f" [尝试{attempt+1}] 异常:{e}")
            if attempt == retry:
                return f"[调用失败:{e}]"
    # Only reachable when the loop body never runs (negative ``retry``).
    return "[调用失败: 未知]"


def check_context_has_answer(context, points):
    """Check, per key point, whether the method and a value phrase occur.

    Matching is a case-insensitive substring test against ``context``.

    Returns:
        A list of ``(method, method_hit, value_hit)`` tuples, one per point.
    """
    context_lower = context.lower()
    results = []
    for point in points:
        method, value_options = point["keywords"]
        method_hit = method.lower() in context_lower
        value_hit = any(v.lower() in context_lower for v in value_options)
        results.append((method, method_hit, value_hit))
    return results


def jaccard_similarity(text1, text2):
    """Return the Jaccard index of the two texts' lower-cased ``\\w+`` token sets.

    Returns 0.0 when either text is empty or no tokens are found.
    """
    if not text1 or not text2:
        return 0.0
    words1 = set(re.findall(r"\w+", text1.lower()))
    words2 = set(re.findall(r"\w+", text2.lower()))
    union = words1 | words2
    if not union:
        return 0.0
    return len(words1 & words2) / len(union)


def cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of the two texts.

    Returns 0.0 when either text is empty, when scikit-learn is not
    installed, or when the vectorizer cannot build a vocabulary.
    """
    if not text1 or not text2:
        return 0.0
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity as sk_cos
    except ImportError:
        return 0.0
    try:
        vec = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised on an empty vocabulary (e.g. punctuation-only answers).
        return 0.0
    return float(sk_cos(vec[0:1], vec[1:2])[0][0])


def calc_recall(answer, points):
    """Return ``(recall, hits)`` for the key points found in ``answer``.

    A point is hit when its method name and at least one of its value
    phrasings both occur in ``answer`` (case-insensitive substring test).
    An empty ``points`` list yields ``(0.0, [])`` instead of dividing by zero.
    """
    if not points:
        return 0.0, []
    answer_lower = answer.lower()
    hits = []
    for p in points:
        method, vals = p["keywords"]
        hits.append(
            method.lower() in answer_lower
            and any(v.lower() in answer_lower for v in vals)
        )
    return sum(hits) / len(points), hits


# ======================= Main flow =======================
def main():
    """Run the compression benchmark end to end and print a diagnostic report."""
    # Lazy import, resolved first so a missing package fails before any work.
    from supercompress import compress_context

    print("=" * 80)
    print(f"检查 Ollama 服务 ({OLLAMA_HOST}) ...")
    if not is_model_available(MODEL_NAME):
        sys.exit(1)

    # ---- Compress ----
    print("\n开始上下文压缩...")
    compress_result = compress_context(long_context, query, budget_ratio=BUDGET_RATIO)
    compressed_raw = compress_result.compressed_text

    # ---- Clean the compressed text ----
    compressed_text = clean_context(compressed_raw)
    print("\n【压缩后文本预览(前500字符)】")
    print(compressed_text[:500] + "...")

    # ---- Check key information ----
    # Computed once on the real compressed text and reused in the report, so
    # the report is not skewed by the fallback to the original context below.
    retention = check_context_has_answer(compressed_text, key_points)
    info_ok = all(m and v for _, m, v in retention)
    if not info_ok:
        print("\n⚠️ 压缩后缺少关键信息,改用原始上下文。")
        compressed_text = long_context
    else:
        print("\n✅ 压缩后包含所有关键信息。")

    # ---- Generate answers ----
    print("\n正在生成原始上下文回答...")
    answer_original = ask_qwen(long_context, query)
    print("\n正在生成压缩后上下文回答...")
    answer_compressed = ask_qwen(compressed_text, query)

    # ---- If the compressed answer is still empty, retry with a direct prompt ----
    if answer_compressed == "[空回答]" or len(answer_compressed) < 3:
        # Reusing the original answer here would inflate the similarity scores
        # and make them meaningless, so the compressed text is queried again
        # with a more explicit extraction prompt and a lower temperature.
        print("\n⚠️ 压缩回答仍为空。改用更直接的提示词基于压缩文本重新生成...")
        fallback_prompt = (
            "Extract the answer from the following text about fetch methods "
            "when no rows are found. Answer concisely.\n\n"
            f"Text:\n{compressed_text}\n\n"
            "Answer:"
        )
        client = get_ollama_client()
        try:
            resp = client.chat(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": fallback_prompt}],
                options={"temperature": 0.3, "num_predict": 512},
            )
            answer_compressed = resp["message"]["content"].strip()
            if not answer_compressed:
                answer_compressed = "[空回答]"
        except Exception as e:
            answer_compressed = f"[调用失败:{e}]"

    # ---- Report ----
    print("\n" + "=" * 80)
    print("【诊断报告】")
    print(f"原始 Token 数:{compress_result.original_tokens}")
    print(f"压缩后 Token 数:{compress_result.kept_tokens}")
    print(f"压缩比例:{compress_result.kv_savings_pct:.1f}%")

    print("\n【信息保留检查】")
    for method, m_hit, v_hit in retention:
        print(f"{method}:{'✅' if m_hit and v_hit else '❌'}")

    print("\n【回答内容】")
    print(f"原始回答:{answer_original}")
    print(f"压缩回答:{answer_compressed}")

    print("\n【量化评估】")
    rec_ori, hits_ori = calc_recall(answer_original, key_points)
    rec_comp, hits_comp = calc_recall(answer_compressed, key_points)
    print(f"召回率:原始{rec_ori:.2f},压缩{rec_comp:.2f}")
    jac = jaccard_similarity(answer_original, answer_compressed)
    cos = cosine_similarity(answer_original, answer_compressed)
    print(f"Jaccard 相似度:{jac:.3f}")
    print(f"Cosine 相似度:{cos:.3f}")

    print("\n【命中明细】")
    for p, hit_ori, hit_comp in zip(key_points, hits_ori, hits_comp):
        print(f"{p['name']}: 原始{'✅' if hit_ori else '❌'}压缩{'✅' if hit_comp else '❌'}")


if __name__ == "__main__":
    main()

# Author's conclusion (translated from the original note): with small models,
# compression can actually make the model's "think" time longer, and Chinese
# is not well supported -- you need to do additional training yourself.
