Extracting structured knowledge from scientific text remains a challenging task for machine learning models. Here, we present a simple approach to joint named entity recognition and relation extraction and demonstrate how pretrained large language models (GPT-3, Llama-2) can be fine-tuned to extract useful records of complex scientific knowledge. We test three representative tasks in materials chemistry: linking dopants and host materials, cataloging metal-organic frameworks, and general composition/phase/morphology/application information extraction. Records are extracted from single sentences or entire paragraphs, and the output can be returned as simple English sentences or a more structured format such as a list of JSON objects. This approach represents a simple, accessible, and highly flexible route to obtaining large databases of structured specialized scientific knowledge extracted from research papers.
Description
Structured information extraction from scientific text with large language models | Nature Communications
%0 Journal Article
%1 Dagdelen2024
%A Dagdelen, John
%A Dunn, Alexander
%A Lee, Sanghoon
%A Walker, Nicholas
%A Rosen, Andrew S.
%A Ceder, Gerbrand
%A Persson, Kristin A.
%A Jain, Anubhav
%D 2024
%J Nature Communications
%K extraction llm
%N 1
%P 1418
%R 10.1038/s41467-024-45563-x
%T Structured information extraction from scientific text with large language models
%U https://doi.org/10.1038/s41467-024-45563-x
%V 15
%X Extracting structured knowledge from scientific text remains a challenging task for machine learning models. Here, we present a simple approach to joint named entity recognition and relation extraction and demonstrate how pretrained large language models (GPT-3, Llama-2) can be fine-tuned to extract useful records of complex scientific knowledge. We test three representative tasks in materials chemistry: linking dopants and host materials, cataloging metal-organic frameworks, and general composition/phase/morphology/application information extraction. Records are extracted from single sentences or entire paragraphs, and the output can be returned as simple English sentences or a more structured format such as a list of JSON objects. This approach represents a simple, accessible, and highly flexible route to obtaining large databases of structured specialized scientific knowledge extracted from research papers.
@comment{Journal article record: Dagdelen et al., Nature Communications, vol. 15, no. 1, p. 1418 (2024). Fields are ordered as core citation data, identifiers, descriptive text, then export metadata (added-at, timestamp, biburl, interhash, intrahash).}
@article{Dagdelen2024,
  author      = {Dagdelen, John and Dunn, Alexander and Lee, Sanghoon and Walker, Nicholas and Rosen, Andrew S. and Ceder, Gerbrand and Persson, Kristin A. and Jain, Anubhav},
  title       = {Structured information extraction from scientific text with large language models},
  journal     = {Nature Communications},
  volume      = {15},
  number      = {1},
  pages       = {1418},
  year        = {2024},
  month       = feb,
  day         = {15},
  doi         = {10.1038/s41467-024-45563-x},
  url         = {https://doi.org/10.1038/s41467-024-45563-x},
  issn        = {2041-1723},
  abstract    = {Extracting structured knowledge from scientific text remains a challenging task for machine learning models. Here, we present a simple approach to joint named entity recognition and relation extraction and demonstrate how pretrained large language models (GPT-3, Llama-2) can be fine-tuned to extract useful records of complex scientific knowledge. We test three representative tasks in materials chemistry: linking dopants and host materials, cataloging metal-organic frameworks, and general composition/phase/morphology/application information extraction. Records are extracted from single sentences or entire paragraphs, and the output can be returned as simple English sentences or a more structured format such as a list of JSON objects. This approach represents a simple, accessible, and highly flexible route to obtaining large databases of structured specialized scientific knowledge extracted from research papers.},
  keywords    = {extraction llm},
  description = {Structured information extraction from scientific text with large language models | Nature Communications},
  added-at    = {2024-03-12T14:45:28.000+0100},
  timestamp   = {2024-03-12T14:45:28.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/2225a58ffb93eeb3a6761fac24c50ce69/mho},
  interhash   = {de2f26a3b10f362ac45d9414b59c0304},
  intrahash   = {225a58ffb93eeb3a6761fac24c50ce69}
}