
02. Semantic Chunking

In simple RAG we used fixed-size text chunks. That approach can cut through sentences and break concepts apart. Semantic chunking instead analyzes the semantic similarity of the text to decide chunk boundaries intelligently.

Learning Objectives

  • Understand the limitations of fixed-size chunking
  • Learn the principle behind semantic chunking and how to implement it
  • Learn how cosine similarity is applied to chunking
  • Compare the effectiveness of different chunking strategies

Problem Analysis

Problems with Fixed-Size Chunking

# Problems fixed-size chunking can cause
text = "Artificial intelligence is a branch of computer science. It seeks to understand the essence of intelligence. It aims to produce machines that can respond in ways similar to human intelligence."

# Fixed-size chunking might split it like this:
# Chunk 1: "Artificial intelligence is a branch of computer science. It seeks to under"
# Chunk 2: "stand the essence of intelligence. It aims to produce machines that can res"
# Chunk 3: "pond in ways similar to human intelligence."

Problems:

  • Sentences are cut in the middle
  • Conceptual integrity is lost
  • Retrieval quality suffers
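
For contrast, here is a minimal fixed-size chunker, written for this illustration rather than taken from the lesson code, that slices text every `chunk_size` characters; running it shows sentences being cut at arbitrary positions.

def fixed_size_chunks(text, chunk_size=60, overlap=0):
    """Slice text every chunk_size characters, ignoring sentence boundaries."""
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

text = ("Artificial intelligence is a branch of computer science. "
        "It seeks to understand the essence of intelligence. "
        "It aims to build machines that respond the way humans do.")

for i, chunk in enumerate(fixed_size_chunks(text)):
    print(f"Chunk {i + 1}: {chunk!r}")  # sentences are cut mid-word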

How Semantic Chunking Works

The core idea of semantic chunking is that adjacent, semantically similar sentences should stay in the same chunk, while a chunk boundary should be placed between sentences whose meanings differ significantly.

Workflow

graph TD
A[Raw text] --> B[Split into sentences]
B --> C[Compute sentence embeddings]
C --> D[Compute similarity between adjacent sentences]
D --> E[Set a similarity threshold]
E --> F[Determine chunk boundaries from the threshold]
F --> G[Produce semantic chunks]
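
Before implementing each step, it helps to see the similarity measure the workflow relies on. Cosine similarity compares the direction of two embedding vectors; here is a tiny worked example with toy 3-dimensional vectors (real sentence embeddings have hundreds of dimensions):

import numpy as np

# Toy 3-dimensional "embeddings"; real models output hundreds of dimensions
a = np.array([0.2, 0.9, 0.1])
b = np.array([0.25, 0.85, 0.05])  # points in nearly the same direction as a
c = np.array([0.9, 0.1, 0.4])     # points in a clearly different direction

cos = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
print(round(cos(a, b), 3))  # close to 1.0 -> likely the same topic, keep together
print(round(cos(a, c), 3))  # much lower -> a good place for a chunk boundary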

Implementation

1. Sentence Splitting

sentence_splitting.py
def split_into_sentences(text):
    """
    Split text into a list of sentences.
    """
    # Naive sentence splitting (period-based)
    sentences = text.split(".")

    # Drop empty entries and restore the trailing period
    sentences = [s.strip() + "." for s in sentences if s.strip()]

    return sentences

# Example
text = "Artificial intelligence is a branch of computer science. It studies the nature of intelligence. AI can simulate human thinking."
sentences = split_into_sentences(text)
print("Sentences:", sentences)

2. Computing Sentence Embeddings

sentence_embeddings.py
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import numpy as np

def create_sentence_embeddings(sentences, model="BAAI/bge-base-en-v1.5"):
    """
    Create an embedding vector for each sentence.
    """
    embedding_model = HuggingFaceEmbedding(model_name=model)
    embeddings = embedding_model.get_text_embedding_batch(sentences)

    return np.array(embeddings)

# Create sentence embeddings
sentences = ["AI is a branch of computer science.",
             "It aims to create intelligent machines.",
             "Machine learning is a subset of AI."]

embeddings = create_sentence_embeddings(sentences)
print(f"Embedding matrix shape: {embeddings.shape}")

3. Computing Semantic Similarity

semantic_similarity.py
import numpy as np

def cosine_similarity(vec1, vec2):
    """
    Compute the cosine similarity of two vectors.
    """
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def calculate_similarities(embeddings):
    """
    Compute the similarity between adjacent sentences.
    """
    similarities = []

    for i in range(len(embeddings) - 1):
        similarity = cosine_similarity(embeddings[i], embeddings[i + 1])
        similarities.append(similarity)

    return similarities

# Compute adjacent-sentence similarities
similarities = calculate_similarities(embeddings)
print("Adjacent sentence similarities:", similarities)

4. Determining Chunk Boundaries

chunking_boundaries.py
def find_chunk_boundaries(similarities, threshold=0.5):
    """
    Determine chunk boundaries from a similarity threshold.

    Args:
        similarities: list of adjacent-sentence similarities
        threshold: similarity threshold; a boundary is placed wherever similarity falls below it

    Returns:
        boundaries: list of boundary positions (sentence indices)
    """
    boundaries = [0]  # the first boundary is always 0

    for i, similarity in enumerate(similarities):
        if similarity < threshold:
            boundaries.append(i + 1)  # place a boundary where similarity is low

    boundaries.append(len(similarities) + 1)  # the final boundary

    return boundaries

# Determine chunk boundaries
boundaries = find_chunk_boundaries(similarities, threshold=0.6)
print("Chunk boundaries:", boundaries)

5. Producing Semantic Chunks

semantic_chunks.py
def create_semantic_chunks(sentences, boundaries):
    """
    Build semantic chunks from the boundary positions.
    """
    chunks = []

    for i in range(len(boundaries) - 1):
        start = boundaries[i]
        end = boundaries[i + 1]

        # Merge the sentences into one chunk
        chunk = " ".join(sentences[start:end])
        chunks.append(chunk)

    return chunks

# Create semantic chunks
semantic_chunks = create_semantic_chunks(sentences, boundaries)

print("Semantic chunking result:")
for i, chunk in enumerate(semantic_chunks):
    print(f"Chunk {i+1}: {chunk}")

Complete Implementation

semantic_chunking_complete.py
import fitz  # PyMuPDF
import numpy as np
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

class SemanticChunker:
    def __init__(self, model_name="BAAI/bge-base-en-v1.5", similarity_threshold=0.5):
        self.embedding_model = HuggingFaceEmbedding(model_name=model_name)
        self.similarity_threshold = similarity_threshold

    def extract_text_from_pdf(self, pdf_path):
        """Extract text from a PDF."""
        mypdf = fitz.open(pdf_path)
        all_text = ""
        for page_num in range(mypdf.page_count):
            page = mypdf[page_num]
            all_text += page.get_text("text") + " "
        return all_text.strip()

    def split_into_sentences(self, text):
        """Split text into sentences."""
        sentences = text.split(".")
        sentences = [s.strip() + "." for s in sentences if s.strip()]
        return sentences

    def cosine_similarity(self, vec1, vec2):
        """Compute cosine similarity."""
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    def calculate_similarities(self, embeddings):
        """Compute adjacent-sentence similarities."""
        similarities = []
        for i in range(len(embeddings) - 1):
            similarity = self.cosine_similarity(embeddings[i], embeddings[i + 1])
            similarities.append(similarity)
        return similarities

    def find_boundaries(self, similarities):
        """Determine chunk boundaries."""
        boundaries = [0]

        for i, similarity in enumerate(similarities):
            if similarity < self.similarity_threshold:
                boundaries.append(i + 1)

        boundaries.append(len(similarities) + 1)
        return boundaries

    def chunk_text(self, text):
        """Main semantic chunking routine."""
        # 1. Split into sentences
        sentences = self.split_into_sentences(text)
        print(f"Split into {len(sentences)} sentences")

        # 2. Create sentence embeddings
        embeddings = self.embedding_model.get_text_embedding_batch(sentences)
        embeddings = np.array(embeddings)

        # 3. Compute similarities
        similarities = self.calculate_similarities(embeddings)

        # 4. Determine boundaries
        boundaries = self.find_boundaries(similarities)

        # 5. Build chunks
        chunks = []
        for i in range(len(boundaries) - 1):
            start = boundaries[i]
            end = boundaries[i + 1]
            chunk = " ".join(sentences[start:end])
            chunks.append(chunk)

        return chunks

    def process_document(self, pdf_path):
        """Process an entire document."""
        # Extract text
        text = self.extract_text_from_pdf(pdf_path)

        # Semantic chunking
        chunks = self.chunk_text(text)

        print(f"Created {len(chunks)} semantic chunks")
        return chunks

# Usage example
if __name__ == "__main__":
    # Create the semantic chunker
    chunker = SemanticChunker(similarity_threshold=0.6)

    # Process a document
    chunks = chunker.process_document("data/AI_Information.pdf")

    # Show the first 3 chunks
    for i, chunk in enumerate(chunks[:3]):
        print(f"\n=== Semantic chunk {i+1} ===")
        print(chunk)
        print("-" * 50)

Improvement Strategies

1. Dynamic Threshold

dynamic_threshold.py
import numpy as np

def calculate_dynamic_threshold(similarities, percentile=25):
    """
    Compute a dynamic threshold from the similarity distribution.

    Args:
        similarities: list of similarities
        percentile: percentile to use (a lower percentile is a stricter splitting criterion,
                    so only the sharpest similarity dips become boundaries)
    """
    threshold = np.percentile(similarities, percentile)
    return threshold

# Use a dynamic threshold
dynamic_threshold = calculate_dynamic_threshold(similarities, percentile=30)
print(f"Dynamic threshold: {dynamic_threshold}")

2. Chunk Size Control

chunk_size_control.py
def create_controlled_chunks(sentences, similarities,
                             max_chunk_size=1000, min_chunk_size=200):
    """
    Semantic chunking with chunk-size control.
    """
    chunks = []
    current_chunk = []
    current_size = 0

    for i, sentence in enumerate(sentences):
        current_chunk.append(sentence)
        current_size += len(sentence)

        # Decide whether to close the current chunk
        should_break = False

        if i < len(similarities):
            # Similarity is low and the current chunk is large enough
            if (similarities[i] < 0.5 and
                    current_size >= min_chunk_size):
                should_break = True

        # The chunk has grown too large: force a split
        if current_size >= max_chunk_size:
            should_break = True

        if should_break or i == len(sentences) - 1:
            chunk_text = " ".join(current_chunk)
            chunks.append(chunk_text)
            current_chunk = []
            current_size = 0

    return chunks
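
Usage mirrors the earlier helpers; a short sketch reusing the sentences and similarities from steps 1-3:

controlled_chunks = create_controlled_chunks(sentences, similarities,
                                             max_chunk_size=1000, min_chunk_size=200)
for i, chunk in enumerate(controlled_chunks):
    print(f"Chunk {i + 1} ({len(chunk)} chars): {chunk[:60]}...")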

Comparing Results

Evaluation Metrics

evaluation.py
import numpy as np

def evaluate_chunking_quality(chunks, queries, ground_truth):
    """
    Evaluate chunking quality.

    Metrics:
    1. Retrieval accuracy
    2. Average chunk size
    3. Size variance

    Assumes helper functions create_embeddings and semantic_search
    (e.g. from the simple RAG lesson) are defined elsewhere.
    """
    # Create embeddings
    chunk_embeddings = create_embeddings(chunks)

    # Retrieval test
    correct_retrievals = 0
    total_queries = len(queries)

    for query, truth in zip(queries, ground_truth):
        retrieved = semantic_search(query, chunks, chunk_embeddings, k=1)
        if truth in retrieved[0]:  # simplified check
            correct_retrievals += 1

    accuracy = correct_retrievals / total_queries

    # Chunk statistics
    chunk_sizes = [len(chunk) for chunk in chunks]
    avg_size = np.mean(chunk_sizes)
    size_variance = np.var(chunk_sizes)

    return {
        'accuracy': accuracy,
        'avg_chunk_size': avg_size,
        'size_variance': size_variance,
        'num_chunks': len(chunks)
    }
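
A sketch of how the metric could be used to compare fixed-size and semantic chunking side by side. The queries and expected snippets below are placeholders to replace with your own test set, and `fixed_chunks` stands in for the output of whatever fixed-size splitter you used in the simple RAG lesson:

queries = ["What is machine learning?"]          # placeholder test queries
ground_truth = ["Machine learning is a subset"]  # placeholder expected snippets

for name, chunks in [("fixed", fixed_chunks), ("semantic", semantic_chunks)]:
    metrics = evaluate_chunking_quality(chunks, queries, ground_truth)
    print(name, metrics)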

Best Practices

1. Threshold Selection

  • High threshold (0.7-0.9): more boundaries are triggered, producing more fine-grained chunks
  • Medium threshold (0.5-0.7): balances the number of chunks against semantic coherence
  • Low threshold (0.3-0.5): fewer boundaries, so chunks are larger and keep more context (the sweep sketch below makes this concrete)
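
The sweep below, using the helpers defined earlier, makes the effect of the threshold visible by counting the chunks each value produces:

for threshold in (0.3, 0.5, 0.7, 0.9):
    boundaries = find_chunk_boundaries(similarities, threshold=threshold)
    print(f"threshold={threshold}: {len(boundaries) - 1} chunks")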

2. Improved Sentence Splitting

import re

def advanced_sentence_split(text):
    """
    Improved sentence splitting.
    """
    # Protect abbreviations and decimal numbers by masking their periods
    text = re.sub(r'\b([A-Z][a-z])\.', r'\1<ABBREV>', text)
    text = re.sub(r'(\d+)\.(\d+)', r'\1<DECIMAL>\2', text)

    # Split into sentences
    sentences = re.split(r'[.!?]+', text)

    # Restore the masked periods
    sentences = [s.replace('<ABBREV>', '.').replace('<DECIMAL>', '.').strip()
                 for s in sentences if s.strip()]

    return sentences

Summary

Semantic chunking has the following advantages over fixed-size chunking:

  1. Preserves semantic integrity: related content stays in the same chunk
  2. Adaptive chunking: chunk sizes adjust automatically to the content
  3. Better retrieval quality: cleaner semantic boundaries improve retrieval results

In the next chapter, Chunk Size Selector, we will learn how to dynamically select the optimal chunk size.