当前位置: 首页 > news > 正文

数据质量检查:保障 AI 训练数据的可靠性

数据质量检查:保障 AI 训练数据的可靠性

前言

垃圾进,垃圾出(Garbage In, Garbage Out)。数据质量直接决定了模型性能,数据质量检查是构建高质量 AI 系统的关键环节。

我在多个项目中实践过数据质量检查,今天分享一些方法和经验。

基础质量检查

文本基本检查

import re
from typing import List, Dict
from collections import Counter


class TextQualityChecker:
    """Rule-based sanity checks for a single piece of text.

    Every ``check_*`` method returns a dict with three keys: ``"check"``
    (the check's name), ``"passed"`` (bool) and ``"message"`` (a
    human-readable explanation).
    """

    # Matches any character outside the accepted set: ASCII, CJK unified
    # ideographs (U+4E00-U+9FFF), whitespace, CJK symbols and punctuation
    # (U+3000-U+303F), full-width ASCII variants (U+FF01-U+FF5E), curly
    # quotes, the em dash and the ellipsis. Full-width punctuation is what
    # ordinary Chinese prose uses, so it must not be reported as "special".
    _INVALID_CHAR_RE = re.compile(
        r'[^\x00-\x7F\u4e00-\u9fff\s\u3000-\u303f\uff01-\uff5e'
        r'\u2018\u2019\u201c\u201d\u2014\u2026]'
    )

    def __init__(self):
        # Not written to by any method of this class; kept for callers that
        # may read or append to it.
        self.reports = []

    def check_empty(self, text: str) -> Dict:
        """Fail when ``text`` is falsy or consists only of whitespace."""
        if not text or not text.strip():
            return {
                "check": "empty_text",
                "passed": False,
                "message": "文本为空"
            }
        return {
            "check": "empty_text",
            "passed": True,
            "message": "文本非空"
        }

    def check_length(self, text: str, min_len: int = 10, max_len: int = 10000) -> Dict:
        """Fail when ``len(text)`` is below ``min_len`` or above ``max_len``.

        Length is measured in characters (code points), not bytes or words.
        """
        length = len(text)
        if length < min_len:
            return {
                "check": "length",
                "passed": False,
                "message": f"文本过短 ({length} < {min_len})"
            }
        if length > max_len:
            return {
                "check": "length",
                "passed": False,
                "message": f"文本过长 ({length} > {max_len})"
            }
        return {
            "check": "length",
            "passed": True,
            "message": f"文本长度正常 ({length})"
        }

    def check_charset(self, text: str) -> Dict:
        """Fail when ``text`` contains characters outside the accepted set.

        The accepted set is described next to ``_INVALID_CHAR_RE``. The
        failure message shows up to five of the offending characters.
        """
        invalid_chars = self._INVALID_CHAR_RE.findall(text)
        if invalid_chars:
            return {
                "check": "charset",
                "passed": False,
                "message": f"包含特殊字符: {set(invalid_chars[:5])}"
            }
        return {
            "check": "charset",
            "passed": True,
            "message": "字符集正常"
        }

    def check_repetition(self, text: str, threshold: float = 0.5) -> Dict:
        """Fail when one character makes up more than ``threshold`` of the text.

        The unit of counting is a single character (whitespace included),
        not a segmented word. Texts shorter than 10 characters always pass.
        """
        chars = list(text)
        n = len(chars)
        if n < 10:
            return {"check": "repetition", "passed": True, "message": "文本过短"}

        # Share of the text taken up by the single most frequent character.
        max_count = max(Counter(chars).values())
        repetition_ratio = max_count / n
        if repetition_ratio > threshold:
            return {
                "check": "repetition",
                "passed": False,
                "message": f"高重复率: {repetition_ratio:.2f}"
            }
        return {
            "check": "repetition",
            "passed": True,
            "message": "重复率正常"
        }

    def check_all(self, text: str) -> List[Dict]:
        """Run every check with default parameters and return the results.

        The results are ordered: empty, length, charset, repetition. A
        ``None`` input is treated as the empty string so that the later
        checks report a failure instead of raising ``TypeError``.
        """
        if text is None:
            text = ""
        checks = (
            self.check_empty,
            self.check_length,
            self.check_charset,
            self.check_repetition,
        )
        return [check(text) for check in checks]

语义质量检查

语义一致性

from sentence_transformers import SentenceTransformer
import numpy as np


class SemanticQualityChecker:
    """Embedding-based checks over a batch of texts."""

    def __init__(self, model_name: str = "shibing624/text2vec-base-chinese"):
        # Instantiate the embedding model once; both checks reuse it.
        self.model = SentenceTransformer(model_name)

    def check_consistency(self, text_list: List[str], threshold: float = 0.7) -> Dict:
        """Compare the mean pairwise cosine similarity against ``threshold``.

        With fewer than two texts there is nothing to compare and the check
        passes. Otherwise every unordered pair of embeddings is scored and
        the check fails when the average score is below ``threshold``.
        """
        if len(text_list) < 2:
            return {"check": "consistency", "passed": True, "message": "样本不足"}

        # NOTE(review): presumably one embedding vector per input text - confirm
        # against the sentence-transformers ``encode`` documentation.
        vectors = self.model.encode(text_list)
        count = len(vectors)

        def cosine(a, b):
            return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

        # Upper triangle only: each unordered pair is scored exactly once.
        scores = [
            cosine(vectors[i], vectors[j])
            for i in range(count)
            for j in range(i + 1, count)
        ]
        avg_similarity = np.mean(scores)

        if not avg_similarity < threshold:
            return {
                "check": "consistency",
                "passed": True,
                "message": f"一致性正常: {avg_similarity:.2f}"
            }
        return {
            "check": "consistency",
            "passed": False,
            "message": f"一致性较低: {avg_similarity:.2f}"
        }

    def check_outlier(self, text_list: List[str], threshold: float = 2.0) -> List[int]:
        """Return indices of texts whose embedding lies far from the centroid.

        Each text's Euclidean distance to the mean embedding is converted to
        a z-score; indices whose z-score exceeds ``threshold`` are returned.
        Fewer than three texts, or identical distances for all texts, yield
        an empty list.
        """
        if len(text_list) < 3:
            return []

        vectors = self.model.encode(text_list)
        center = np.mean(vectors, axis=0)
        distances = [np.linalg.norm(vec - center) for vec in vectors]

        mean_dist = np.mean(distances)
        std_dist = np.std(distances)
        if std_dist == 0:
            # All points are equally far from the centroid: no outliers.
            return []

        return [
            idx
            for idx, dist in enumerate(distances)
            if (dist - mean_dist) / std_dist > threshold
        ]

语言质量

import jieba


class LanguageQualityChecker:
    """Language-level heuristics: vocabulary richness and sentence length.

    Each check returns a dict with the keys ``"check"``, ``"passed"`` and
    ``"message"``.
    """

    def __init__(self):
        # Not consulted by the checks below; kept as part of the instance's
        # public state.
        self.stopwords = {"的", "了", "是", "在", "我", "有", "和", "就"}

    def check_vocabulary_richness(self, text: str) -> Dict:
        """Fail when the ratio of distinct tokens to all tokens is below 0.3.

        Tokens come from ``jieba.lcut``. An empty token list fails with a
        dedicated message.
        """
        words = jieba.lcut(text)
        if not words:
            return {"check": "vocabulary", "passed": False, "message": "无词汇"}

        richness = len(set(words)) / len(words)
        if richness < 0.3:
            return {
                "check": "vocabulary",
                "passed": False,
                "message": f"词汇丰富度低: {richness:.2f}"
            }
        return {
            "check": "vocabulary",
            "passed": True,
            "message": f"词汇丰富度正常: {richness:.2f}"
        }

    def check_sentence_structure(self, text: str) -> Dict:
        """Fail when there are no sentences or their mean length is below 5.

        Sentences are split on the ideographic full stop and on exclamation
        and question marks in both ASCII and full-width form, since Chinese
        prose normally uses the full-width variants. Length is measured in
        characters after stripping surrounding whitespace.
        """
        pieces = re.split(r'[。!?\uff01\uff1f]', text)
        sentences = [s.strip() for s in pieces if s.strip()]
        if not sentences:
            return {"check": "sentence", "passed": False, "message": "无完整句子"}

        avg_length = np.mean([len(s) for s in sentences])
        if avg_length < 5:
            return {
                "check": "sentence",
                "passed": False,
                "message": f"句子过短: {avg_length:.1f} 字"
            }
        return {
            "check": "sentence",
            "passed": True,
            "message": f"句子结构正常: {avg_length:.1f} 字"
        }

去重与清洗

精确与模糊去重

import hashlib
from typing import List, Dict
from dataclasses import dataclass


@dataclass
class DuplicateCheckResult:
    """Outcome of a de-duplication pass."""

    # Records kept after de-duplication, in their original relative order.
    unique_data: List[Dict]
    # Number of input records that were dropped as duplicates.
    duplicates_count: int
    # Groups of input indices judged to be near-duplicates of each other.
    duplicate_groups: List[List[int]]


class DuplicateChecker:
    """Exact (hash-based) and fuzzy (SimHash-based) de-duplication."""

    def __init__(self):
        # MD5 digests of every text seen by exact_dedup on this instance.
        self.seen = set()

    def exact_dedup(self, data: List[Dict], text_key: str = "text") -> DuplicateCheckResult:
        """Drop records whose text is byte-identical to an earlier one.

        The set of seen digests lives on the instance, so a text that was
        seen in a previous call on the same checker is also counted as a
        duplicate. ``duplicate_groups`` is always empty for this method.

        Raises:
            KeyError: if a record has no ``text_key`` entry.
        """
        unique = []
        duplicates = 0
        for item in data:
            digest = hashlib.md5(item[text_key].encode()).hexdigest()
            if digest in self.seen:
                duplicates += 1
                continue
            self.seen.add(digest)
            unique.append(item)

        return DuplicateCheckResult(
            unique_data=unique,
            duplicates_count=duplicates,
            duplicate_groups=[]
        )

    def fuzzy_dedup(self, data: List[Dict], text_key: str = "text",
                    threshold: float = 0.95) -> DuplicateCheckResult:
        """Drop near-duplicate records using 64-bit SimHash fingerprints.

        ``threshold`` is read as the minimum fraction of identical
        fingerprint bits: two records are near-duplicates when their
        fingerprints differ in at most ``int((1 - threshold) * 64)`` bits
        (3 bits at the default 0.95). The first record of each group is
        kept, and records without any near-duplicate are kept as well.

        Returns:
            A result whose ``unique_data`` preserves input order and whose
            ``duplicate_groups`` lists, for every group of two or more
            records, the input indices with the kept record first.
        """
        if not data:
            return DuplicateCheckResult(
                unique_data=[], duplicates_count=0, duplicate_groups=[]
            )

        from simhash import Simhash, SimhashIndex

        fingerprint_bits = 64
        tolerance = int((1.0 - threshold) * fingerprint_bits)
        tolerance = min(fingerprint_bits - 1, max(0, tolerance))

        # SimhashIndex expects (obj_id, simhash) pairs and reports matches
        # by obj_id, so the stringified input index serves as the id.
        fingerprints = [Simhash(item[text_key]) for item in data]
        index = SimhashIndex(
            [(str(i), sh) for i, sh in enumerate(fingerprints)],
            k=tolerance,
        )

        unique_data = []
        duplicate_groups = []
        assigned = set()
        for i, sh in enumerate(fingerprints):
            if i in assigned:
                continue
            near = sorted(int(obj_id) for obj_id in index.get_near_dups(sh))
            # Every index below i has already been assigned, so the group
            # consists of i plus any later, still unassigned neighbours.
            group = [i] + [j for j in near if j != i and j not in assigned]
            assigned.update(group)
            unique_data.append(data[i])
            if len(group) > 1:
                duplicate_groups.append(group)

        return DuplicateCheckResult(
            unique_data=unique_data,
            duplicates_count=len(data) - len(unique_data),
            duplicate_groups=duplicate_groups
        )

完整质量检查流水线

class DataQualityPipeline:
    """End-to-end quality pipeline: de-duplicate, check each record, sample
    a semantic outlier count, and optionally filter out failing records."""

    def __init__(self):
        self.text_checker = TextQualityChecker()
        self.semantic_checker = SemanticQualityChecker()
        self.language_checker = LanguageQualityChecker()
        self.duplicate_checker = DuplicateChecker()

    def run_pipeline(self, data: List[Dict]) -> Dict:
        """Run all checks and return the report.

        The report contains ``total`` (input size), ``duplicates``,
        ``passed`` / ``failed`` counts over the de-duplicated records,
        ``failures`` (each with an ``index`` into the de-duplicated list and
        the full list of check results) and ``statistics``.
        """
        report, _ = self._run(data)
        return report

    def _run(self, data: List[Dict]) -> tuple:
        """Run the pipeline and return ``(report, deduplicated_records)``."""
        report = {
            "total": len(data),
            "passed": 0,
            "failed": 0,
            "failures": [],
            "duplicates": 0,
            "statistics": {}
        }

        # 1. Exact de-duplication; later steps only see the survivors.
        # NOTE(review): the duplicate checker remembers texts across calls,
        # so re-running on the same instance reports everything as duplicate.
        dedup_result = self.duplicate_checker.exact_dedup(data)
        report["duplicates"] = dedup_result.duplicates_count
        deduped = dedup_result.unique_data

        # 2. Per-record text and language checks.
        for i, item in enumerate(deduped):
            text = item.get("text", "")
            all_checks = self.text_checker.check_all(text) + [
                self.language_checker.check_vocabulary_richness(text),
                self.language_checker.check_sentence_structure(text),
            ]
            if all(c["passed"] for c in all_checks):
                report["passed"] += 1
            else:
                report["failed"] += 1
                # The index refers to the de-duplicated list, not the input.
                report["failures"].append({"index": i, "checks": all_checks})

        # 3. Semantic outliers on the first 100 records at most; skipped
        #    for batches of 10 or fewer.
        if len(deduped) > 10:
            sample = [d["text"] for d in deduped[:100]]
            outliers = self.semantic_checker.check_outlier(sample)
            report["statistics"]["semantic_outliers"] = len(outliers)

        return report, deduped

    def clean_data(self, data: List[Dict]) -> List[Dict]:
        """Return the de-duplicated records that passed every check.

        Failure indices in the report are positions in the de-duplicated
        list, so the filter is applied to that list rather than to the raw
        input; filtering the raw input would drop the wrong records whenever
        duplicates were removed.
        """
        report, deduped = self._run(data)
        failed_indices = {f["index"] for f in report["failures"]}
        return [
            item for i, item in enumerate(deduped)
            if i not in failed_indices
        ]

总结

数据质量检查要点:

  1. 基础检查:空值、长度、字符集
  2. 语义检查:一致性、离群点
  3. 语言检查:词汇、句子结构
  4. 去重:精确去重+模糊去重
  5. 清洗:移除低质量数据

实践建议:

  • 建立数据质量标准
  • 定期检查和清洗
  • 同时保留清洗前的原始数据和清洗后的数据,便于对比和回溯
  • 持续优化检查规则
http://www.gsyq.cn/news/1416471.html

相关文章:

  • 2026年5月最新|上海GEO优化公司推荐:精选本地优质服务商,助力企业抢占 AI 搜索流量 - GEO排行榜
  • 在Hermes Agent框架中集成TaoToken实现自定义模型调用
  • 原神自动化助手完整指南:如何让游戏自己玩起来
  • 把Diffblue Cover集成到GitHub Actions,实现提交代码自动生成测试
  • Multilingual-E5-small API参考手册:开发者必备的完整接口文档
  • Agent Skills 万千应用 · 第11篇_AI 新闻情报 Skill:每天自动抓取你关心的 AI 动态
  • 深入MAX9295/9296内部:图解GMSL2-CSI2的4种MIPI PHY模式与数据通路设计
  • 基于树莓派与Python的智能调酒机DIY:从GPIO控制到GUI开发全解析
  • Secrets Manager Agent 升级:预取密钥 + IAM 角色切换,冷启动延迟直降 90%
  • 【MATLAB】工业安全联锁与急停逻辑仿真验证
  • 6种字重双格式:PingFangSC苹果平方字体完整技术方案
  • 基于Arduino与舵机的交互式密码保险箱制作全攻略
  • Jina-embedding-t-en-v1在信息检索中的应用:构建高效语义搜索系统
  • 如何轻松实现微信聊天记录的永久保存与智能分析
  • Citra跨平台游戏模拟:3步快速配置终极指南
  • OpCore Simplify终极指南:如何3小时快速搭建稳定黑苹果系统
  • 3种技术方案解决跨平台字体显示难题:PingFangSC字体包实战指南
  • python学习随笔
  • 15分钟告别黑苹果配置噩梦:OpCore-Simplify智能向导带你轻松搞定OpenCore EFI
  • 通达信缠论插件:3分钟让技术分析效率提升90%
  • 合肥好柿科技有限公司(好柿科技)官网、联系方式、官方网站、联系电话、联系地址、抖音账号、公司地址 - 寻茫精选
  • G-Helper终极指南:如何用轻量级工具彻底掌控你的华硕笔记本
  • 专业跨平台字体方案:6种字重PingFangSC苹方字体实战指南
  • ppf-contact-solver行业应用:汽车、航空和医疗领域的潜在用途
  • 如何永久保存微信聊天记录:5步完整使用WeChatMsg终极指南
  • Deep-Live-Cam实时换脸终极指南:解决inswapper_128_fp16.onnx模型加载失败的完整方案
  • 巴中外贸建站推荐,WaiMaoYa 外贸鸭精准引流+高效转化,双重提升外贸业绩 - 外贸独立站运营
  • 告别U盘!用Windows Server 2019+WDS+MDT搭建企业级PXE批量装机环境(保姆级避坑)
  • OpCore Simplify深度解析:智能黑苹果EFI配置的完整解决方案
  • VEX机器人高速颜色分选机构设计:从气动活板门到毫秒级响应