class NERDataProcessor:
    """Character-level BIO preprocessing for Chinese NER.

    Converts character-offset entity spans into per-character BIO tags and
    recovers entity spans from a BIO tag sequence.
    """

    def __init__(self, entity_types: list = None):
        """Create a processor.

        Args:
            entity_types: Entity labels that may be tagged. When omitted, a
                fresh ``["PER", "ORG", "LOC"]`` list is created for this
                instance. A caller-supplied list is stored as-is.
        """
        # A list literal used as the default would be created once and then
        # shared (and mutated) across every default-constructed instance.
        if entity_types is None:
            entity_types = ["PER", "ORG", "LOC"]
        self.entity_types = entity_types

    def text_to_bios(self, text: str, entities: list) -> tuple:
        """Convert text plus entity spans into character-level BIO tags.

        Args:
            text: The raw text; every character receives one tag.
            entities: ``[(start_char, end_char, label), ...]`` with ``end_char``
                exclusive.

        Returns:
            ``(chars, labels)``: the characters of ``text`` as a list and a
            tag list of the same length.

        Spans that fall outside the text, are empty, or carry a label not in
        ``self.entity_types`` are ignored. Spans are processed in order of
        start offset, and a span that overlaps one already tagged is skipped:
        writing it would leave the earlier entity's trailing ``I-`` tags
        orphaned and make the sequence invalid BIO.
        """
        text_len = len(text)
        char_labels = ["O"] * text_len
        covered_until = 0  # exclusive end of the last span that was written
        for start, end, label in sorted(entities, key=lambda span: span[0]):
            if not 0 <= start < end <= text_len:
                continue
            if label not in self.entity_types:
                continue
            if start < covered_until:
                # Overlaps a span that started earlier; keep the earlier one.
                continue
            char_labels[start] = f"B-{label}"
            char_labels[start + 1:end] = [f"I-{label}"] * (end - start - 1)
            covered_until = end
        return list(text), char_labels

    def bios_to_entities(self, chars: list, labels: list) -> list:
        """Recover entities from character-level BIO tags.

        Args:
            chars: Sequence of characters.
            labels: Sequence of BIO tags aligned with ``chars``.

        Returns:
            ``[(entity_text, label, start, end), ...]`` with ``end`` exclusive.

        An ``I-`` tag only extends the open entity when its type matches;
        any other ``I-`` tag closes the open entity and is itself ignored.
        If the two inputs differ in length, only the common prefix is read.
        """
        entities = []
        buf, tag, start = [], None, 0
        # zip() stops at the shorter input, so a trailing entity must end
        # there rather than at len(chars).
        usable = min(len(chars), len(labels))

        for pos, (char, label) in enumerate(zip(chars, labels)):
            if label.startswith("I-") and tag is not None and label[2:] == tag:
                buf.append(char)
                continue
            # Any other tag terminates the entity currently being collected.
            if tag is not None:
                entities.append(("".join(buf), tag, start, pos))
                tag = None
            if label.startswith("B-"):
                buf, tag, start = [char], label[2:], pos

        if tag is not None:
            entities.append(("".join(buf), tag, start, usable))
        return entities