Smart entity-recognition annotation with Hugging Face pretrained models: generating JSON in the format doccano requires


Strongly recommended: doccano data annotation platform (introduction, installation, usage, and pitfalls) - CSDN blog

Hugging Face official website


1. Hugging Face

For background, refer directly to these existing tutorials on pretrained models:

[Huggingface Transformers] Beginner-friendly tutorial, part 1 - Zhihu

[Huggingface Transformers] Beginner-friendly tutorial, part 2: fine-tuning pretrained models - Zhihu

A guide to the Huggingface Transformers Trainer - Zhihu
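
As a quick orientation, here is a minimal sketch of the transformers pipeline call that all the scripts below rely on; the model name is the one used throughout this article, and the example sentence is arbitrary:

from transformers import pipeline

# aggregation_strategy='simple' merges sub-word tokens into whole-entity spans
tagger = pipeline(task='ner',
                  model='xlm-roberta-large-finetuned-conll03-english',
                  aggregation_strategy='simple')

ents = tagger('Google was founded by Larry Page and Sergey Brin.')
# each entry is a dict with entity_group, score, word, start, end, e.g.
# {'entity_group': 'ORG', 'score': 0.99, 'word': 'Google', 'start': 0, 'end': 6}
for e in ents:
    print(e['entity_group'], e['word'], e['start'], e['end'])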

2. doccano annotation platform format requirements

For doccano platform operations, see the guide linked at the top of this article.

Required format for JSON data import, showing both entities and relations:

{
    "text": "Google was founded on September 4, 1998, by Larry Page and Sergey Brin.",
    "entities": [
        {
            "id": 0,
            "start_offset": 0,
            "end_offset": 6,
            "label": "ORG"
        },
        {
            "id": 1,
            "start_offset": 22,
            "end_offset": 39,
            "label": "DATE"
        },
        {
            "id": 2,
            "start_offset": 44,
            "end_offset": 54,
            "label": "PERSON"
        },
        {
            "id": 3,
            "start_offset": 59,
            "end_offset": 70,
            "label": "PERSON"
        }
    ],
    "relations": [
        {
            "from_id": 0,
            "to_id": 1,
            "type": "foundedAt"
        },
        {
            "from_id": 0,
            "to_id": 2,
            "type": "foundedBy"
        },
        {
            "from_id": 0,
            "to_id": 3,
            "type": "foundedBy"
        }
    ]
}
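
For bulk import, doccano reads JSONL: one object of the shape above per line. A minimal writer sketch (the records list and the sample.jsonl file name are illustrative):

import json

records = [
    {
        'text': 'Google was founded on September 4, 1998, by Larry Page and Sergey Brin.',
        'entities': [
            {'id': 0, 'start_offset': 0, 'end_offset': 6, 'label': 'ORG'},
        ],
        'relations': [],
    },
]

# one JSON object per line; ensure_ascii=False keeps non-ASCII labels readable
with open('sample.jsonl', 'w', encoding='utf8') as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + '\n')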

3. Smart entity annotation + format conversion

3.1 Long text (one long txt file)

The script below runs the pretrained model to recognize entities line by line and writes doccano-style JSONL; conversion to the Colabeler (精灵标注助手) format is covered in section 3.3.

from transformers import pipeline
import pandas as pd
import json


def return_single_entity(name, start, end):
    # doccano sequence-labeling label format: [start_offset, end_offset, label]
    return [int(start), int(end), name]


input_path = 'C:/Users/admin/Desktop/test_input.txt'
output_dir = 'C:/Users/admin/Desktop/outputs'

tagger = pipeline(task='ner', model='xlm-roberta-large-finetuned-conll03-english',
                  aggregation_strategy='simple')

# only keep persons and organizations; LOC (locations) and MISC (other) are ignored
keywords = {'PER': '人', 'ORG': '机构'}

with open(input_path, 'r', encoding='utf8') as f:
    lines = f.readlines()

# json_list must live outside the loop: defined inside, it would be re-created
# on every iteration and the previous results would be lost
json_list = []
for line in lines:
    text = line.strip('\n').strip("'").strip('"')
    named_ents = tagger(text)  # pretrained-model NER
    df = pd.DataFrame(named_ents)
    # Example result:
    #    entity_group     score                    word  start  end
    # 0           ORG  0.999997  National Science Board     18   40
    # 1           ORG  0.999997                     NSB     42   45
    # 2           ORG  0.999997                     NSF     71   74
    entity_list = []
    for index, elem in df.iterrows():
        if elem.entity_group not in keywords:
            continue
        if elem.end - elem.start <= 1:  # skip one-character spans
            continue
        entity_list.append(return_single_entity(
            keywords[elem.entity_group], elem.start, elem.end))
    json_list.append(json.dumps({'text': text, 'label': entity_list}))

with open(f'{output_dir}/data_2.json', 'w', encoding='utf8') as f:
    for line in json_list:
        f.write(line + '\n')

print('done!')

Recognition output:

{"text": "The company was founded in 1852 by Jacob Estey\n", "label": [[35, 46, "\u4eba"]]}
{"text": "The company was founded in 1852 by Jacob Estey, who bought out another Brattleboro manufacturing business.", "label": [[35, 46, "\u4eba"], [71, 82, "\u673a\u6784"]]}

The label strings look garbled, but they are just \uXXXX ASCII escapes; no need to worry, they display normally once imported into doccano.
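
If you would rather have readable labels in the file itself, pass ensure_ascii=False to json.dumps; both spellings decode to the same data:

import json

record = {'text': 'The company was founded in 1852 by Jacob Estey', 'label': [[35, 46, '人']]}
print(json.dumps(record))                      # labels written as \u4eba escapes
print(json.dumps(record, ensure_ascii=False))  # labels written as raw UTF-8: 人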

3.2 Multiple short texts (one txt file each)

from transformers import pipeline
import os
from tqdm import tqdm
import pandas as pd
import json


def return_single_entity(name, start, end):
    return [int(start), int(end), name]


input_dir = 'C:/Users/admin/Desktop/inputs_test'
output_dir = 'C:/Users/admin/Desktop/outputs'

tagger = pipeline(task='ner', model='xlm-roberta-large-finetuned-conll03-english', aggregation_strategy='simple')

json_list = []
keywords = {'PER': '人', 'ORG': '机构'}
for filename in tqdm(os.listdir(input_dir)[:3]):  # only the first 3 files, as a quick test
    # read the data and auto-annotate it
    with open(f'{input_dir}/{filename}', 'r', encoding='utf8') as f:
        text = f.read()
    named_ents = tagger(text)
    df = pd.DataFrame(named_ents)
    # convert to the doccano import format
    entity_list = []
    for index, elem in df.iterrows():
        if elem.entity_group not in keywords:
            continue
        if elem.end - elem.start <= 1:
            continue
        entity = return_single_entity(keywords[elem.entity_group], elem.start, elem.end)
        entity_list.append(entity)
    file_obj = {'text': text, 'label': entity_list}
    json_obj = json.dumps(file_obj)
    json_list.append(json_obj)

with open(f'{output_dir}/data3.json', 'w', encoding='utf8') as f:
    f.write('\n'.join(json_list))

print('done!')

3.3 Conversion to the Colabeler (精灵标注助手) format

from transformers import pipeline
import os
from tqdm import tqdm
import pandas as pd
from time import time
import json


def return_single_entity(name, word, start, end, id, attributes=None):
    # build one Colabeler entity object; avoid a mutable default argument
    entity = {}
    entity['type'] = 'T'
    entity['name'] = name
    entity['value'] = word
    entity['start'] = int(start)
    entity['end'] = int(end)
    entity['attributes'] = attributes if attributes is not None else []
    entity['id'] = int(id)
    return entity


input_dir = 'E:/datasets/myUIE/inputs'
output_dir = 'E:/datasets/myUIE/outputs'

tagger = pipeline(task='ner', model='xlm-roberta-large-finetuned-conll03-english', aggregation_strategy='simple')

keywords = {'PER': '人', 'ORG': '机构'}
for filename in tqdm(os.listdir(input_dir)):
    # read the data and auto-annotate it
    with open(f'{input_dir}/{filename}', 'r', encoding='utf8') as f:
        text = f.read()
    named_ents = tagger(text)
    df = pd.DataFrame(named_ents)
    # Convert to the Colabeler (精灵标注助手) import format. Note: Colabeler's NLP
    # module has encoding problems, so some UTF-8 characters do not display
    # correctly, which can affect annotation results.
    id = 1
    entity_list = ['']  # the leading empty string mirrors the [""] padding used below
    for index, elem in df.iterrows():
        if elem.entity_group not in keywords:
            continue
        entity = return_single_entity(keywords[elem.entity_group], elem.word, elem.start, elem.end, id)
        id += 1
        entity_list.append(entity)

    python_obj = {'path': f'{input_dir}/{filename}',
                  'outputs': {'annotation': {'T': entity_list, "E": [""], "R": [""], "A": [""]}},
                  'time_labeled': int(1000 * time()), 'labeled': True, 'content': text}
    data = json.dumps(python_obj)
    # os.path.splitext removes the extension; rstrip('.txt') would wrongly strip trailing '.', 't', 'x' characters
    with open(f'{output_dir}/{os.path.splitext(filename)[0]}.json', 'w', encoding='utf8') as f:
        f.write(data)

print('done!')

4. Improving annotation quality

4.1 Manual review

Nothing fancy here: go through the annotated data record by record. After smart pre-annotation this is already far less work than labeling from scratch.

4.2 Removing empty annotations

import json

dir_path = r'C:/Users/admin/Desktop/光合项目/自动标注'  # change to your own directory

with open(f'{dir_path}/pre_data.jsonl', 'r', encoding='utf8') as f:  # input file name
    text = f.readlines()

content = [json.loads(elem.strip('\n')) for elem in text]
# Keep only records that contain at least one entity. The key is 'entities'
# here; if your export uses 'label' (as the files generated in section 3 do),
# adjust the key accordingly.
content = [json.dumps(cont) for cont in content if cont['entities'] != []]

with open(f'{dir_path}/remove_empty_data.jsonl', 'w', encoding='utf8') as f:  # output file name
    f.write('\n'.join(content))

print('data written')
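
A quick self-contained check of the filter logic above; the two sample records are made up for illustration:

import json

samples = [
    '{"text": "Google was founded by Larry Page.", "entities": [{"id": 0, "start_offset": 0, "end_offset": 6, "label": "ORG"}], "relations": []}',
    '{"text": "Nothing to tag here.", "entities": [], "relations": []}',
]
kept = [s for s in samples if json.loads(s)['entities'] != []]
print(len(kept))  # 1: the record without entities is dropped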