Module nafparserpy.parser

Wraps lxml to facilitate handling of NAF documents

Expand source code
"""
Wraps lxml to facilitate handling of NAF documents
"""
import datetime
import re
from typing import Any, Tuple, Dict, List

from nafparserpy.layers.naf_header import NafHeader, LP, LinguisticProcessors
from nafparserpy.layers.raw import Raw
from lxml import etree

from nafparserpy.layers.factory import create_from_node, create_from_elements

NAF_VERSION = '3.3'


def split_naf_header_attrs(attrs):
    """Separate nafHeader attributes into fileDesc attributes and public attributes

    Parameters
    ----------
    attrs : dict
        dictionary mixing public (`publicId`, `uri`) and fileDesc (`title`, `author`, `creationtime`,
        `filename`, `filetype`, `pages`) attributes

    Returns
    -------
    a tuple `(fileDesc attributes, public attributes)` of dictionaries

    Raises
    ------
    KeyError: if the input dictionary contains a key that is neither a public nor a fileDesc attribute
    """
    public_names = ('publicId', 'uri')
    filedesc_names = ('title', 'author', 'creationtime', 'filename', 'filetype', 'pages')
    for_filedesc, for_public = {}, {}
    for key in attrs:
        if key in public_names:
            for_public[key] = attrs[key]
            continue
        if key in filedesc_names:
            for_filedesc[key] = attrs[key]
            continue
        # anything else is not a known nafHeader attribute
        raise KeyError('unknown public/fileDesc key: {}'.format(key))
    return for_filedesc, for_public


def validate_dtd(tree, dtd='naf_v3.3.dtd'):
    """Validate tree against DTD

    Parameters
    ----------
    tree : ElementTree
        NAF tree
    dtd : str
        path to DTD; the default is a relative path, so it is resolved against the current working directory

    Raises
    ------
    ValueError : if tree is not valid
    """
    # keep the validator in its own name so that `dtd` still holds the path for the error message
    with open(dtd) as infile:
        dtd_validator = etree.DTD(infile)
    if not dtd_validator.validate(tree.getroot()):
        raise ValueError(f"Input tree does not conform to DTD {dtd}")


def remove_lps(ling_processors_layer_node):
    """Remove all children of the given node; the node itself and its attributes are kept

    Parameters
    ----------
    ling_processors_layer_node : Element
        node to empty (used for `linguisticProcessors` nodes)
    """
    # iterate over a snapshot: the node is modified while its children are removed
    for child in list(ling_processors_layer_node):
        ling_processors_layer_node.remove(child)


class NafParser:
    def __init__(self, tree=None, lang='en', version=None, decorate=True, **attrs):
        """
        Create a NAF document from an existing tree or from scratch.

        Parameters
        ----------
        tree : etree
            input tree
        lang : str
            document language, defaults to `en`. This parameter is ignored if tree is not None
        version : str
            NAF version, defaults to `parser.NAF_VERSION`; ignored if tree is not None
        decorate : bool
            adds covered text to span nodes
        attrs : dict
            nafHeader fileDesc and public attributes; ignored if tree is not None

        Raises
        ------
        KeyError: if `attrs` contains a key that is neither a public nor a fileDesc attribute
        """
        self.decorate = decorate
        # set before any layer is added, so that the attribute always exists for the other methods
        self.id_map = {}
        if tree is None:
            self.tree = etree.ElementTree(etree.Element('NAF'))
            self.root = self.tree.getroot()
            self.root.set('{http://www.w3.org/XML/1998/namespace}lang', lang)
            self.root.set('version', NAF_VERSION if version is None else version)
            if attrs:
                filedesc_attrs, public_attrs = split_naf_header_attrs(attrs)
                self.add_naf_header(fileDesc_attrs=filedesc_attrs, public_attrs=public_attrs)
        else:
            self.tree = tree
            self.root = self.tree.getroot()
            self.id_map = self.targets2indices()

    @staticmethod
    def load(naf_file, validate_against_dtd=False, decorate=True):
        """Create a NAF document from a NAF file

        Parameters
        ----------
        naf_file : str
            path to NAF file
        validate_against_dtd : bool
            validates input tree against DTD if True (`validate_dtd` is called with its default DTD path)
        decorate : bool
            adds covered text to span nodes

        Raises
        ------
        ValueError: if `validate_against_dtd` is True, and input file does not conform to the DTD
        """
        tree = etree.parse(naf_file, etree.XMLParser(remove_blank_text=True, strip_cdata=False))

        if validate_against_dtd:
            validate_dtd(tree)

        return NafParser(tree, decorate=decorate)

    def write(self, file_path=None):
        """Write NAF tree to file or stdout if no file path is given

        Parameters
        ----------
        file_path : str
            path of the output file; the document is printed to stdout if None
        """
        if file_path is None:
            # `tostring` returns bytes for a byte encoding: decode them, so that the XML document is printed
            # instead of the repr of a bytes object
            xml_bytes = etree.tostring(self.root, encoding='UTF-8', pretty_print=True, xml_declaration=True)
            print(xml_bytes.decode('UTF-8'))
        else:
            self.tree.write(file_path, encoding='UTF-8', pretty_print=True, xml_declaration=True)

    def has_layer(self, layer: str) -> bool:
        """Returns True if an element with the given layer name exists anywhere below the root"""
        return self.root.find('.//{}'.format(layer)) is not None

    def get(self, layer_name: str):
        """Return a layer object for the layer with the given layer-name.

        Returns only the first object if more elements carry the same name.

        Raises
        ------
        ValueError: if no element carries the given layer-name
        """
        nodes = self.root.findall('.//{}'.format(layer_name))
        if not nodes:
            raise ValueError("layer {} does not exist".format(layer_name))
        return create_from_node[layer_name](nodes[0])

    def getall(self, layer_name: str):
        """Return a list of layer objects for each layer carrying the given layer-name

        Raises
        ------
        ValueError: if no element carries the given layer-name
        """
        nodes = self.root.findall('.//{}'.format(layer_name))
        if not nodes:
            raise ValueError("layer {} does not exist".format(layer_name))
        return [create_from_node[layer_name](node) for node in nodes]

    def add_layer(self, layer_name: str, element: Any, exist_ok=False):
        """Add a layer to the NAF xml tree

        The new layer is appended to the root. The target-id map is recomputed when a `text` or `terms` layer is
        added, and covered text is added to span nodes if the document was created with `decorate=True`.

        Parameters
        ----------
        layer_name : str
            naf layer name
        element : Any
            layer object
        exist_ok : bool
            allows replacement of existing layer

        Raises
        ------
        ValueError: if layer already exists and `exist_ok` is False
        """
        already_there = self.has_layer(layer_name)
        if already_there and not exist_ok:
            raise ValueError('Layer {} already exists'.format(layer_name))
        if already_there:
            # `has_layer` also matches nested elements, whereas only a direct child of the root is replaced here
            existing = self.root.find(layer_name)
            if existing is not None:
                self.root.remove(existing)
        self.root.append(element.node())
        if layer_name in ('text', 'terms'):
            self.reset_targets2indices()
        if self.decorate:
            self.add_comments()

    def add_layer_from_elements(self, layer_name: str, elements: list, exist_ok=False):
        """Create container layer from its elements.

        This method can be applied to non-empty layers without attributes. This concerns almost all layers,
        except for `NafHeader`, `Raw` and `TemporalRelations`

        Parameters
        ----------
        layer_name : str
            naf layer name
        elements : list
            list of layer elements objects
        exist_ok : bool
            allows replacement of existing layer

        Raises
        ------
        ValueError: if layer already exists and `exist_ok` is False
        """
        self.add_layer(layer_name,
                       create_from_elements[layer_name](elements),
                       exist_ok=exist_ok)

    def add_naf_header(self, fileDesc_attrs=None, public_attrs=None, linguistic_processors=None, exist_ok=False):
        """
        Create and add `nafHeader` layer

        Parameters
        ----------
        fileDesc_attrs : dict
            `fileDesc` layer attributes; defaults to no attributes
        public_attrs : dict
            `public` layer attributes; defaults to no attributes
        linguistic_processors : list[LinguisticProcessors]
            list of `LinguisticProcessors` objects per layer; defaults to an empty list
        exist_ok : bool
            allows replacement of existing layer

        Raises
        ------
        ValueError: if a `nafHeader` layer already exists and `exist_ok` is False
        """
        # fresh containers per call instead of mutable default arguments shared between calls
        header = NafHeader.create({} if fileDesc_attrs is None else fileDesc_attrs,
                                  {} if public_attrs is None else public_attrs,
                                  [] if linguistic_processors is None else linguistic_processors)
        self.add_layer('nafHeader', header, exist_ok)

    def add_linguistic_processor(self, layer: str, name: str, version: str, lpDependencies=None, attributes=None,
                                 add_time_stamp=True, replace=False):
        """Add a `linguistic processor` element to the linguistic processors list for the given layer.

        Creates a `nafHeader` layer and/or a `linguisticProcessors` layer if there is not one yet.

        Parameters
        ----------
        layer : str
            the name of the layer
        name : str
            the name of the linguistic processor
        version : str
            the version of the linguistic processor
        lpDependencies : List(LPDependency)
            list of linguistic processor dependencies; defaults to an empty list
        attributes : dict
            optional linguistic processor attributes ('timestamp', 'beginTimestamp', 'endTimestamp', 'hostname');
            the dictionary passed in is not modified
        add_time_stamp : bool
            create time stamp (current UTC time, overrides a 'timestamp' given in `attributes`)
        replace : bool
            replace or append to `lp` elements for that layer
        """
        if not self.has_layer('nafHeader'):
            self.add_naf_header()
        # work on a copy: the time stamp must neither leak into the caller's dictionary nor into later calls
        lp_attributes = dict(attributes) if attributes else {}
        lp_dependencies = [] if lpDependencies is None else lpDependencies
        if add_time_stamp:
            lp_attributes['timestamp'] = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec='seconds')
        self.add_lp(layer, LP(name, version, lp_dependencies, lp_attributes), replace)

    def add_lp(self, layer: str, linguistic_processor: LP, replace: bool):
        """Add a linguistic processor element to the linguistic processors list for the given layer.

        Creates a `linguisticProcessors` layer if there is not one yet. Pre-existing linguistic processor elements are
        replaced if `replace` is True. The `nafHeader` layer must already exist.

        Parameters
        ----------
        layer : str
            the name of the layer
        linguistic_processor : LP
            the linguistic processor
        replace : bool
            replace or append to `lp` elements for that layer
        """
        naf_header_node = self.root.find('nafHeader')
        ling_processors_layer_nodes = [lps for lps in naf_header_node.findall('linguisticProcessors')
                                       if lps.get('layer') == layer]
        if not ling_processors_layer_nodes:
            naf_header_node.append(LinguisticProcessors(layer, [linguistic_processor]).node())
            return
        # only the first `linguisticProcessors` element for that layer is modified
        ling_processors_layer_node = ling_processors_layer_nodes[0]
        if replace:
            remove_lps(ling_processors_layer_node)
        ling_processors_layer_node.append(linguistic_processor.node())

    def extend_lps(self, layer: str, linguistic_processors: List[LP], replace=False):
        """Add linguistic processor elements to the linguistic processors list for the given layer.

        Creates a `linguisticProcessors` layer if there is not one yet. Pre-existing linguistic processor elements are
        replaced if `replace` is True. The `nafHeader` layer must already exist.

        Parameters
        ----------
        layer : str
            the name of the layer
        linguistic_processors : List[LP]
            the linguistic processors
        replace : bool
            replace or append to `lp` elements for that layer
        """
        naf_header_node = self.root.find('nafHeader')
        ling_processors_layer_nodes = [lps for lps in naf_header_node.findall('linguisticProcessors')
                                       if lps.get('layer') == layer]
        if not ling_processors_layer_nodes:
            naf_header_node.append(LinguisticProcessors(layer, linguistic_processors).node())
            return
        # only the first `linguisticProcessors` element for that layer is modified
        ling_processors_layer_node = ling_processors_layer_nodes[0]
        if replace:
            remove_lps(ling_processors_layer_node)
        # the new elements are added to the XML node itself, also when replacing
        ling_processors_layer_node.extend([lp.node() for lp in linguistic_processors])

    def add_raw_layer(self, text: str, exist_ok=False):
        """Add (or replace) raw layer from text

        Parameters
        ----------
        text : str
            raw layer text
        exist_ok : bool
            allows replacement of existing layer

        Raises
        ------
        ValueError: if a raw layer already exists and `exist_ok` is False
        """
        self.add_layer('raw', Raw(text), exist_ok)

    def get_lps(self, layer_name):
        """Return list of linguistic processors for a given layer

        Parameters
        ----------
        layer_name: str
            layer name

        Returns
        -------
        list of Lp objects of the first `linguisticProcessors` element for that layer, or None if there is no
        `linguisticProcessors` element for that layer

        Raises
        ------
        ValueError: if the document has no `linguisticProcessors` element at all
        """
        for lprocessors in self.getall('linguisticProcessors'):
            if lprocessors.layer_name == layer_name:
                return lprocessors.lps
        return None

    def targets2indices(self) -> Dict[str, Tuple[int, int]]:
        """Map each word form, subtoken or term id to its begin and end indices

        A term is mapped to the begin index of its first word form and the end index of its last word form.

        Returns
        -------
        map of target ids to start and end indices; empty if there is no `text` layer
        """
        if not self.has_layer('text'):
            return {}
        id_map = {}
        for wf in self.get('text'):
            id_map[wf.id] = (int(wf.offset), int(wf.offset) + int(wf.length))
            if wf.subtokens:
                id_map.update({st.id: (int(st.offset), int(st.offset) + int(st.length)) for st in wf.subtokens})
        if self.has_layer('terms'):     # higher layers may reference to terms
            # map term ids to begin/end indices through word-form ids; a multi-word term runs from the begin of
            # its first word form to the end of its last one
            twf_map = {}
            for term in self.get('terms'):
                wf_ids = term.span.target_ids()
                twf_map[term.id] = (id_map[wf_ids[0]][0], id_map[wf_ids[-1]][1])
            id_map.update(twf_map)
        return id_map

    def add_comments(self):
        """Add covered text as comment to all Span elements that have no comment yet

        The covered text runs from the begin index of the first target of a span to the end index of its last
        target, and is taken from the raw layer.
        """
        spans = [x for x in self.root.findall('.//span') if not [_ for _ in x.iter(tag=etree.Comment)]]
        if not spans:
            return
        if not self.id_map:
            self.id_map = self.targets2indices()
        # the raw text is the same for all spans: look it up once instead of searching the tree for each span
        raw_text = self.get('raw').text
        for span_node in spans:
            tid_span = [t.get('id') for t in span_node.findall('target')]
            begin, end = self.id_map[tid_span[0]][0], self.id_map[tid_span[-1]][1]
            comment = raw_text[begin:end]
            # an XML comment may not contain '--' nor end with '-'
            comment = comment.replace('--', '-~')
            comment = re.sub('-$', '~', comment)
            span_node.append(etree.Comment(comment))

    def covered_text(self, target_ids: List[str]) -> str:
        """Return text covered by the target ids

        Parameters
        ----------
        target_ids: List[str]
            target ids

        Returns
        -------
        covered text

        Raises
        ------
        ValueError: if the document has no target ids, or no raw layer
        """
        start, end = self.start_end_indices(target_ids)
        return self.get('raw').text[start:end]

    def start_end_indices(self, target_ids: List[str]) -> Tuple[int, int]:
        """Return the start and end indices of the span represented by the target ids

        Parameters
        ----------
        target_ids: List[str]
            target ids

        Returns
        -------
        tuple of start and end indices: the start of the first target and the end of the last target

        Raises
        ------
        ValueError: if the document has no target ids
        """
        if not self.id_map:
            # the map may not have been computed yet on a newly created document
            self.id_map = self.targets2indices()
            if not self.id_map:
                raise ValueError('No target ids found')
        return self.id_map[target_ids[0]][0], self.id_map[target_ids[-1]][1]

    def reset_targets2indices(self):
        """Recomputes the mapping of all word forms, subtokens and terms to their start and end indices.

        This mapping is computed in a restricted number of cases: when loading an existing NAF document, when
        adding a `text` or `terms` layer, or when retrieving the covered text on a newly created NAF document.
        The present function can be called when adding layers for which the mapping will be relevant, such as
        subtokens or terms on a NAF document already annotated with word forms.
        """
        self.id_map = self.targets2indices()

Functions

def remove_lps(ling_processors_layer_node)
Expand source code
def remove_lps(ling_processors_layer_node):
    """Remove all children of the given node; the node itself and its attributes are kept

    Parameters
    ----------
    ling_processors_layer_node : Element
        node to empty (used for `linguisticProcessors` nodes)
    """
    # iterate over a snapshot: the node is modified while its children are removed
    for child in list(ling_processors_layer_node):
        ling_processors_layer_node.remove(child)
def split_naf_header_attrs(attrs)

Split input attributes into public and fileDesc attributes

Parameters

attrs : dict
dictionary of public/fileDesc attributes

Returns

a tuple of attribute dictionaries for fileDesc and public
 

Raises

KeyError : if the input dictionary contains keywords not pertaining to public/fileDesc attributes
 
Expand source code
def split_naf_header_attrs(attrs):
    """Separate nafHeader attributes into fileDesc attributes and public attributes

    Parameters
    ----------
    attrs : dict
        dictionary mixing public (`publicId`, `uri`) and fileDesc (`title`, `author`, `creationtime`,
        `filename`, `filetype`, `pages`) attributes

    Returns
    -------
    a tuple `(fileDesc attributes, public attributes)` of dictionaries

    Raises
    ------
    KeyError: if the input dictionary contains a key that is neither a public nor a fileDesc attribute
    """
    public_names = ('publicId', 'uri')
    filedesc_names = ('title', 'author', 'creationtime', 'filename', 'filetype', 'pages')
    for_filedesc, for_public = {}, {}
    for key in attrs:
        if key in public_names:
            for_public[key] = attrs[key]
            continue
        if key in filedesc_names:
            for_filedesc[key] = attrs[key]
            continue
        # anything else is not a known nafHeader attribute
        raise KeyError('unknown public/fileDesc key: {}'.format(key))
    return for_filedesc, for_public
def validate_dtd(tree, dtd='naf_v3.3.dtd')

Validate tree against DTD

Parameters

tree : ElementTree
NAF tree
dtd : str
path to DTD

Raises

ValueError : if tree is not valid
 
Expand source code
def validate_dtd(tree, dtd='naf_v3.3.dtd'):
    """Validate tree against DTD

    Parameters
    ----------
    tree : ElementTree
        NAF tree
    dtd : str
        path to DTD; the default is a relative path, so it is resolved against the current working directory

    Raises
    ------
    ValueError : if tree is not valid
    """
    # keep the validator in its own name so that `dtd` still holds the path for the error message
    with open(dtd) as infile:
        dtd_validator = etree.DTD(infile)
    if not dtd_validator.validate(tree.getroot()):
        raise ValueError(f"Input tree does not conform to DTD {dtd}")

Classes

class NafParser (tree=None, lang='en', version=None, decorate=True, **attrs)

Create a NAF document from an existing tree or from scratch.

Parameters

tree : etree
input tree
lang : str
document language, defaults to en. This parameter is ignored if tree is not None
version : str
NAF version, defaults to parser.NAF_VERSION; ignored if tree is not None
decorate : bool
adds covered text to span nodes
attrs : dict
nafHeader fileDesc and public attributes; ignored if tree is not None
Expand source code
class NafParser:
    def __init__(self, tree=None, lang='en', version=None, decorate=True, **attrs):
        """
        Create a NAF document from an existing tree or from scratch.

        Parameters
        ----------
        tree : etree
            input tree
        lang : str
            document language, defaults to `en`. This parameter is ignored if tree is not None
        version : str
            NAF version, defaults to `parser.NAF_VERSION`; ignored if tree is not None
        decorate : bool
            adds covered text to span nodes
        attrs : dict
            nafHeader fileDesc and public attributes; ignored if tree is not None

        Raises
        ------
        KeyError: if `attrs` contains a key that is neither a public nor a fileDesc attribute
        """
        self.decorate = decorate
        # set before any layer is added, so that the attribute always exists for the other methods
        self.id_map = {}
        if tree is None:
            self.tree = etree.ElementTree(etree.Element('NAF'))
            self.root = self.tree.getroot()
            self.root.set('{http://www.w3.org/XML/1998/namespace}lang', lang)
            self.root.set('version', NAF_VERSION if version is None else version)
            if attrs:
                filedesc_attrs, public_attrs = split_naf_header_attrs(attrs)
                self.add_naf_header(fileDesc_attrs=filedesc_attrs, public_attrs=public_attrs)
        else:
            self.tree = tree
            self.root = self.tree.getroot()
            self.id_map = self.targets2indices()

    @staticmethod
    def load(naf_file, validate_against_dtd=False, decorate=True):
        """Create a NAF document from a NAF file

        Parameters
        ----------
        naf_file : str
            path to NAF file
        validate_against_dtd : bool
            validates input tree against DTD if True (`validate_dtd` is called with its default DTD path)
        decorate : bool
            adds covered text to span nodes

        Raises
        ------
        ValueError: if `validate_against_dtd` is True, and input file does not conform to the DTD
        """
        tree = etree.parse(naf_file, etree.XMLParser(remove_blank_text=True, strip_cdata=False))

        if validate_against_dtd:
            validate_dtd(tree)

        return NafParser(tree, decorate=decorate)

    def write(self, file_path=None):
        """Write NAF tree to file or stdout if no file path is given

        Parameters
        ----------
        file_path : str
            path of the output file; the document is printed to stdout if None
        """
        if file_path is None:
            # `tostring` returns bytes for a byte encoding: decode them, so that the XML document is printed
            # instead of the repr of a bytes object
            xml_bytes = etree.tostring(self.root, encoding='UTF-8', pretty_print=True, xml_declaration=True)
            print(xml_bytes.decode('UTF-8'))
        else:
            self.tree.write(file_path, encoding='UTF-8', pretty_print=True, xml_declaration=True)

    def has_layer(self, layer: str) -> bool:
        """Returns True if an element with the given layer name exists anywhere below the root"""
        return self.root.find('.//{}'.format(layer)) is not None

    def get(self, layer_name: str):
        """Return a layer object for the layer with the given layer-name.

        Returns only the first object if more elements carry the same name.

        Raises
        ------
        ValueError: if no element carries the given layer-name
        """
        nodes = self.root.findall('.//{}'.format(layer_name))
        if not nodes:
            raise ValueError("layer {} does not exist".format(layer_name))
        return create_from_node[layer_name](nodes[0])

    def getall(self, layer_name: str):
        """Return a list of layer objects for each layer carrying the given layer-name

        Raises
        ------
        ValueError: if no element carries the given layer-name
        """
        nodes = self.root.findall('.//{}'.format(layer_name))
        if not nodes:
            raise ValueError("layer {} does not exist".format(layer_name))
        return [create_from_node[layer_name](node) for node in nodes]

    def add_layer(self, layer_name: str, element: Any, exist_ok=False):
        """Add a layer to the NAF xml tree

        The new layer is appended to the root. The target-id map is recomputed when a `text` or `terms` layer is
        added, and covered text is added to span nodes if the document was created with `decorate=True`.

        Parameters
        ----------
        layer_name : str
            naf layer name
        element : Any
            layer object
        exist_ok : bool
            allows replacement of existing layer

        Raises
        ------
        ValueError: if layer already exists and `exist_ok` is False
        """
        already_there = self.has_layer(layer_name)
        if already_there and not exist_ok:
            raise ValueError('Layer {} already exists'.format(layer_name))
        if already_there:
            # `has_layer` also matches nested elements, whereas only a direct child of the root is replaced here
            existing = self.root.find(layer_name)
            if existing is not None:
                self.root.remove(existing)
        self.root.append(element.node())
        if layer_name in ('text', 'terms'):
            self.reset_targets2indices()
        if self.decorate:
            self.add_comments()

    def add_layer_from_elements(self, layer_name: str, elements: list, exist_ok=False):
        """Create container layer from its elements.

        This method can be applied to non-empty layers without attributes. This concerns almost all layers,
        except for `NafHeader`, `Raw` and `TemporalRelations`

        Parameters
        ----------
        layer_name : str
            naf layer name
        elements : list
            list of layer elements objects
        exist_ok : bool
            allows replacement of existing layer

        Raises
        ------
        ValueError: if layer already exists and `exist_ok` is False
        """
        self.add_layer(layer_name,
                       create_from_elements[layer_name](elements),
                       exist_ok=exist_ok)

    def add_naf_header(self, fileDesc_attrs=None, public_attrs=None, linguistic_processors=None, exist_ok=False):
        """
        Create and add `nafHeader` layer

        Parameters
        ----------
        fileDesc_attrs : dict
            `fileDesc` layer attributes; defaults to no attributes
        public_attrs : dict
            `public` layer attributes; defaults to no attributes
        linguistic_processors : list[LinguisticProcessors]
            list of `LinguisticProcessors` objects per layer; defaults to an empty list
        exist_ok : bool
            allows replacement of existing layer

        Raises
        ------
        ValueError: if a `nafHeader` layer already exists and `exist_ok` is False
        """
        # fresh containers per call instead of mutable default arguments shared between calls
        header = NafHeader.create({} if fileDesc_attrs is None else fileDesc_attrs,
                                  {} if public_attrs is None else public_attrs,
                                  [] if linguistic_processors is None else linguistic_processors)
        self.add_layer('nafHeader', header, exist_ok)

    def add_linguistic_processor(self, layer: str, name: str, version: str, lpDependencies=None, attributes=None,
                                 add_time_stamp=True, replace=False):
        """Add a `linguistic processor` element to the linguistic processors list for the given layer.

        Creates a `nafHeader` layer and/or a `linguisticProcessors` layer if there is not one yet.

        Parameters
        ----------
        layer : str
            the name of the layer
        name : str
            the name of the linguistic processor
        version : str
            the version of the linguistic processor
        lpDependencies : List(LPDependency)
            list of linguistic processor dependencies; defaults to an empty list
        attributes : dict
            optional linguistic processor attributes ('timestamp', 'beginTimestamp', 'endTimestamp', 'hostname');
            the dictionary passed in is not modified
        add_time_stamp : bool
            create time stamp (current UTC time, overrides a 'timestamp' given in `attributes`)
        replace : bool
            replace or append to `lp` elements for that layer
        """
        if not self.has_layer('nafHeader'):
            self.add_naf_header()
        # work on a copy: the time stamp must neither leak into the caller's dictionary nor into later calls
        lp_attributes = dict(attributes) if attributes else {}
        lp_dependencies = [] if lpDependencies is None else lpDependencies
        if add_time_stamp:
            lp_attributes['timestamp'] = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec='seconds')
        self.add_lp(layer, LP(name, version, lp_dependencies, lp_attributes), replace)

    def add_lp(self, layer: str, linguistic_processor: LP, replace: bool):
        """Add a linguistic processor element to the linguistic processors list for the given layer.

        Creates a `linguisticProcessors` layer if there is not one yet. Pre-existing linguistic processor elements are
        replaced if `replace` is True. The `nafHeader` layer must already exist.

        Parameters
        ----------
        layer : str
            the name of the layer
        linguistic_processor : LP
            the linguistic processor
        replace : bool
            replace or append to `lp` elements for that layer
        """
        naf_header_node = self.root.find('nafHeader')
        ling_processors_layer_nodes = [lps for lps in naf_header_node.findall('linguisticProcessors')
                                       if lps.get('layer') == layer]
        if not ling_processors_layer_nodes:
            naf_header_node.append(LinguisticProcessors(layer, [linguistic_processor]).node())
            return
        # only the first `linguisticProcessors` element for that layer is modified
        ling_processors_layer_node = ling_processors_layer_nodes[0]
        if replace:
            remove_lps(ling_processors_layer_node)
        ling_processors_layer_node.append(linguistic_processor.node())

    def extend_lps(self, layer: str, linguistic_processors: List[LP], replace=False):
        """Add linguistic processor elements to the linguistic processors list for the given layer.

        Creates a `linguisticProcessors` layer if there is not one yet. Pre-existing linguistic processor elements are
        replaced if `replace` is True. The `nafHeader` layer must already exist.

        Parameters
        ----------
        layer : str
            the name of the layer
        linguistic_processors : List[LP]
            the linguistic processors
        replace : bool
            replace or append to `lp` elements for that layer
        """
        naf_header_node = self.root.find('nafHeader')
        ling_processors_layer_nodes = [lps for lps in naf_header_node.findall('linguisticProcessors')
                                       if lps.get('layer') == layer]
        if not ling_processors_layer_nodes:
            naf_header_node.append(LinguisticProcessors(layer, linguistic_processors).node())
            return
        # only the first `linguisticProcessors` element for that layer is modified
        ling_processors_layer_node = ling_processors_layer_nodes[0]
        if replace:
            remove_lps(ling_processors_layer_node)
        # the new elements are added to the XML node itself, also when replacing
        ling_processors_layer_node.extend([lp.node() for lp in linguistic_processors])

    def add_raw_layer(self, text: str, exist_ok=False):
        """Add (or replace) raw layer from text

        Parameters
        ----------
        text : str
            raw layer text
        exist_ok : bool
            allows replacement of existing layer

        Raises
        ------
        ValueError: if a raw layer already exists and `exist_ok` is False
        """
        self.add_layer('raw', Raw(text), exist_ok)

    def get_lps(self, layer_name):
        """Return list of linguistic processors for a given layer

        Parameters
        ----------
        layer_name: str
            layer name

        Returns
        -------
        list of Lp objects of the first `linguisticProcessors` element for that layer, or None if there is no
        `linguisticProcessors` element for that layer

        Raises
        ------
        ValueError: if the document has no `linguisticProcessors` element at all
        """
        for lprocessors in self.getall('linguisticProcessors'):
            if lprocessors.layer_name == layer_name:
                return lprocessors.lps
        return None

    def targets2indices(self) -> Dict[str, Tuple[int, int]]:
        """Map each word form, subtoken or term id to its begin and end indices

        A term is mapped to the begin index of its first word form and the end index of its last word form.

        Returns
        -------
        map of target ids to start and end indices; empty if there is no `text` layer
        """
        if not self.has_layer('text'):
            return {}
        id_map = {}
        for wf in self.get('text'):
            id_map[wf.id] = (int(wf.offset), int(wf.offset) + int(wf.length))
            if wf.subtokens:
                id_map.update({st.id: (int(st.offset), int(st.offset) + int(st.length)) for st in wf.subtokens})
        if self.has_layer('terms'):     # higher layers may reference to terms
            # map term ids to begin/end indices through word-form ids; a multi-word term runs from the begin of
            # its first word form to the end of its last one
            twf_map = {}
            for term in self.get('terms'):
                wf_ids = term.span.target_ids()
                twf_map[term.id] = (id_map[wf_ids[0]][0], id_map[wf_ids[-1]][1])
            id_map.update(twf_map)
        return id_map

    def add_comments(self):
        """Add covered text as comment to all Span elements that have no comment yet

        The covered text runs from the begin index of the first target of a span to the end index of its last
        target, and is taken from the raw layer.
        """
        spans = [x for x in self.root.findall('.//span') if not [_ for _ in x.iter(tag=etree.Comment)]]
        if not spans:
            return
        if not self.id_map:
            self.id_map = self.targets2indices()
        # the raw text is the same for all spans: look it up once instead of searching the tree for each span
        raw_text = self.get('raw').text
        for span_node in spans:
            tid_span = [t.get('id') for t in span_node.findall('target')]
            begin, end = self.id_map[tid_span[0]][0], self.id_map[tid_span[-1]][1]
            comment = raw_text[begin:end]
            # an XML comment may not contain '--' nor end with '-'
            comment = comment.replace('--', '-~')
            comment = re.sub('-$', '~', comment)
            span_node.append(etree.Comment(comment))

    def covered_text(self, target_ids: List[str]) -> str:
        """Return text covered by the target ids

        Parameters
        ----------
        target_ids: List[str]
            target ids

        Returns
        -------
        covered text

        Raises
        ------
        ValueError: if the document has no target ids, or no raw layer
        """
        start, end = self.start_end_indices(target_ids)
        return self.get('raw').text[start:end]

    def start_end_indices(self, target_ids: List[str]) -> Tuple[int, int]:
        """Return the start and end indices of the span represented by the target ids

        Parameters
        ----------
        target_ids: List[str]
            target ids

        Returns
        -------
        tuple of start and end indices: the start of the first target and the end of the last target

        Raises
        ------
        ValueError: if the document has no target ids
        """
        if not self.id_map:
            # the map may not have been computed yet on a newly created document
            self.id_map = self.targets2indices()
            if not self.id_map:
                raise ValueError('No target ids found')
        return self.id_map[target_ids[0]][0], self.id_map[target_ids[-1]][1]

    def reset_targets2indices(self):
        """Recomputes the mapping of all word forms, subtokens and terms to their start and end indices.

        This mapping is computed in a restricted number of cases: when loading an existing NAF document, when
        adding a `text` or `terms` layer, or when retrieving the covered text on a newly created NAF document.
        The present function can be called when adding layers for which the mapping will be relevant, such as
        subtokens or terms on a NAF document already annotated with word forms.
        """
        self.id_map = self.targets2indices()

Static methods

def load(naf_file, validate_against_dtd=False, decorate=True)

Create a NAF document from a NAF file

Parameters

naf_file : str
path to NAF file
validate_against_dtd : bool
validates input tree against DTD if True
decorate : bool
adds covered text to span nodes

Raises

ValueError : if validate_against_dtd is True, and input file does not conform to the DTD
 
Expand source code
@staticmethod
def load(naf_file, validate_against_dtd=False, decorate=True):
    """Build a NAF document from a NAF file

    Parameters
    ----------
    naf_file : str
        path to NAF file
    validate_against_dtd : bool
        validates the parsed tree against the DTD if True
    decorate : bool
        adds covered text to span nodes

    Raises
    ------
    ValueError: if `validate_against_dtd` is True, and input file does not conform to the DTD
    """
    # blank text nodes are dropped, CDATA sections are kept as they are
    xml_parser = etree.XMLParser(remove_blank_text=True, strip_cdata=False)
    parsed_tree = etree.parse(naf_file, xml_parser)

    if validate_against_dtd:
        validate_dtd(parsed_tree)

    return NafParser(parsed_tree, decorate=decorate)

Methods

def add_comments(self)

Add covered text as comment to all Span elements that have no comment yet

Expand source code
def add_comments(self):
    """Add covered text as comment to all Span elements that have no comment yet

    The covered text is the slice of the raw layer text running from the start index of the first target
    of a span to the end index of its last target. Spans without any target are left untouched.
    """
    uncommented = [span for span in self.root.findall('.//span')
                   if next(iter(span.iter(tag=etree.Comment)), None) is None]
    if not uncommented:
        return
    if not self.id_map:
        self.id_map = self.targets2indices()
    # decorating does not modify the raw layer: fetch its text once instead of once per span
    raw_text = self.get('raw').text
    for span_node in uncommented:
        tid_span = [t.get('id') for t in span_node.findall('target')]
        if not tid_span:
            # nothing is covered by a span without targets
            continue
        begin, end = self.id_map[tid_span[0]][0], self.id_map[tid_span[-1]][1]
        comment = raw_text[begin:end]
        # an XML comment may neither contain '--' nor end with '-'
        comment = comment.replace('--', '-~')
        comment = re.sub('-$', '~', comment)
        span_node.append(etree.Comment(comment))
def add_layer(self, layer_name: str, element: Any, exist_ok=False)

Add a layer to the NAF xml tree

Parameters

layer_name : str
naf layer name
element : Any
layer object
exist_ok : bool
allows replacement of existing layer

Raises

ValueError : if layer already exists and exist_ok is False
 
Expand source code
def add_layer(self, layer_name: str, element: Any, exist_ok=False):
    """Append a layer to the NAF xml tree, replacing an existing one if allowed

    After the layer is appended, the id mapping is recomputed for the 'text' and 'terms' layers, and
    covered-text comments are added if the document is decorated.

    Parameters
    ----------
    layer_name : str
        naf layer name
    element : Any
        layer object
    exist_ok : bool
        allows replacement of existing layer

    Raises
    ------
    ValueError: if layer already exists and `exist_ok` is False
    """
    already_present = self.has_layer(layer_name)
    if already_present and not exist_ok:
        raise ValueError('Layer {} already exists'.format(layer_name))
    if already_present:
        # drop the previous version of the layer before appending the new one
        previous = self.root.find(layer_name)
        self.root.remove(previous)
    self.root.append(element.node())
    if layer_name in ('text', 'terms'):
        self.reset_targets2indices()
    if self.decorate:
        self.add_comments()
def add_layer_from_elements(self, layer_name: str, elements: list, exist_ok=False)

Create container layer from its elements.

This method can be applied to non-empty layers without attributes. This concerns almost all layers, except for NafHeader, Raw and TemporalRelations

Parameters

layer_name : str
naf layer name
elements : list
list of layer elements objects
exist_ok : bool
allows replacement of existing layer

Raises

ValueError : if layer already exists and exist_ok is False
 
Expand source code
def add_layer_from_elements(self, layer_name: str, elements: list, exist_ok=False):
    """Build a container layer from its elements and add it to the document.

    This method can be applied to non-empty layers without attributes. This concerns almost all layers,
    except for `NafHeader`, `Raw` and `TemporalRelations`

    Parameters
    ----------
    layer_name : str
        naf layer name
    elements : list
        list of layer elements objects
    exist_ok : bool
        allows replacement of existing layer

    Raises
    ------
    ValueError: if layer already exists and `exist_ok` is False
    """
    # the factory registered under the layer name wraps the elements in their container layer
    layer_factory = create_from_elements[layer_name]
    self.add_layer(layer_name, layer_factory(elements), exist_ok=exist_ok)
def add_linguistic_processor(self, layer: str, name: str, version: str, lpDependencies=[], attributes={}, add_time_stamp=True, replace=False)

Add a linguistic processor element to the linguistic processors list for the given layer.

Creates a nafHeader layer and/or a linguisticProcessors layer if there is not one yet.

Parameters

layer : str
the name of the layer
name : str
the name of the linguistic processor
version : str
the version of the linguistic processor
lpDependencies : List(LPDependency)
list of linguistic processor dependencies
attributes : dict
optional linguistic processor attributes ('timestamp', 'beginTimestamp', 'endTimestamp', 'hostname')
add_time_stamp : bool
create time stamp
replace : bool
replace or append to lp elements for that layer
Expand source code
def add_linguistic_processor(self, layer: str, name: str, version: str, lpDependencies=[], attributes={},
                             add_time_stamp=True, replace=False):
    """Add a `linguistic processor` element to the linguistic processors list for the given layer.

    Creates a `nafHeader` layer and/or a `linguisticProcessors` layer if there is not one yet.

    The `lpDependencies` and `attributes` arguments are copied before use: neither the containers passed
    by the caller nor the shared default values are modified.

    Parameters
    ----------
    layer : str
        the name of the layer
    name : str
        the name of the linguistic processor
    version : str
        the version of the linguistic processor
    lpDependencies : List(LPDependency)
        list of linguistic processor dependencies
    attributes : dict
        optional linguistic processor attributes ('timestamp', 'beginTimestamp', 'endTimestamp', 'hostname')
    add_time_stamp : bool
        create time stamp (overrides a 'timestamp' given in `attributes`)
    replace : bool
        replace or append to `lp` elements for that layer
    """
    if not self.has_layer('nafHeader'):
        self.add_naf_header()
    # work on copies: writing the timestamp into the default dict would leak it into every later call
    lp_attributes = dict(attributes) if attributes else {}
    lp_dependencies = list(lpDependencies) if lpDependencies else []
    if add_time_stamp:
        lp_attributes['timestamp'] = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec='seconds')
    self.add_lp(layer, LP(name, version, lp_dependencies, lp_attributes), replace)
def add_lp(self, layer: str, linguistic_processor: LP, replace: bool)

Add a linguistic processor element to the linguistic processors list for the given layer.

Creates a linguisticProcessors layer if there is not one yet. Pre-existing linguistic processor elements are replaced if replace is True.

Parameters

layer : str
the name of the layer
linguistic_processor : LP
the linguistic processor
replace : bool
replace or append to lp elements for that layer
Expand source code
def add_lp(self, layer: str, linguistic_processor: LP, replace: bool):
    """Register a linguistic processor in the linguistic processors list of the given layer.

    Creates a `linguisticProcessors` layer if there is not one yet. Pre-existing linguistic processor elements are
    replaced if `replace` is True.

    Parameters
    ----------
    layer : str
        the name of the layer
    linguistic_processor : LP
        the linguistic processor
    replace : bool
        replace or append to `lp` elements for that layer
    """
    header_node = self.root.find('nafHeader')
    matching_nodes = [node for node in header_node.findall('linguisticProcessors')
                      if node.get('layer') == layer]
    if not matching_nodes:
        # no processors recorded for this layer yet: create the container with its first processor
        header_node.append(LinguisticProcessors(layer, [linguistic_processor]).node())
        return
    # only the first container found for the layer is modified
    layer_node = matching_nodes[0]
    if replace:
        remove_lps(layer_node)
    layer_node.append(linguistic_processor.node())
def add_naf_header(self, fileDesc_attrs={}, public_attrs={}, linguistic_processors=[], exist_ok=False)

Create and add nafHeader layer

Parameters

fileDesc_attrs : dict
fileDesc layer attributes
public_attrs : dict
public layer attributes
linguistic_processors : list[LinguisticProcessors]
list of LinguisticProcessors objects per layer
exist_ok : bool
allows replacement of existing layer
Expand source code
def add_naf_header(self, fileDesc_attrs={}, public_attrs={}, linguistic_processors=[], exist_ok=False):
    """
    Build a `nafHeader` layer and add it to the document

    Parameters
    ----------
    fileDesc_attrs : dict
        `fileDesc` layer attributes
    public_attrs : dict
        `public` layer attributes
    linguistic_processors : list[LinguisticProcessors]
        list of `LinguisticProcessors` objects per layer
    exist_ok : bool
        allows replacement of existing layer
    """
    header = NafHeader.create(fileDesc_attrs, public_attrs, linguistic_processors)
    self.add_layer('nafHeader', header, exist_ok)
def add_raw_layer(self, text: str, exist_ok=False)

Add (or replace) raw layer from text

Parameters

text : str
raw layer text
exist_ok : bool
allows replacement of existing layer
Expand source code
def add_raw_layer(self, text: str, exist_ok=False):
    """Create the raw layer from text and add it to the document

    Parameters
    ----------
    text : str
        raw layer text
    exist_ok : bool
        allows replacement of existing layer
    """
    raw_layer = Raw(text)
    self.add_layer('raw', raw_layer, exist_ok)
def covered_text(self, target_ids: List[str]) -> str

Return text covered by the target ids

Parameters

target_ids : List[str]
target ids

Returns

covered text
 
Expand source code
def covered_text(self, target_ids: List[str]) -> str:
    """Return the part of the raw text spanned by the target ids

    Parameters
    ----------
    target_ids: List[str]
        target ids; the first one provides the start index, the last one the end index

    Returns
    -------
    the substring of the raw layer text between these indices
    """
    begin, finish = self.start_end_indices(target_ids)
    raw_text = self.get('raw').text
    return raw_text[begin:finish]
def extend_lps(self, layer: str, linguistic_processors: List[LP], replace=False)

Add linguistic processor elements to the linguistic processors list for the given layer.

Creates a linguisticProcessors layer if there is not one yet.

Parameters

layer : str
the name of the layer
linguistic_processors : List[LP]
the linguistic processors
Expand source code
def extend_lps(self, layer: str, linguistic_processors: List[LP], replace=False):
    """Add linguistic processor elements to the linguistic processors list for the given layer.

    Creates a `linguisticProcessors` layer if there is not one yet. Pre-existing linguistic processor elements
    are removed first if `replace` is True.

    Parameters
    ----------
    layer : str
        the name of the layer
    linguistic_processors : List[LP]
        the linguistic processors
    replace : bool
        replace or append to `lp` elements for that layer
    """
    naf_header_node = self.root.find('nafHeader')
    ling_processors_layer_nodes = [lps for lps in naf_header_node.findall('linguisticProcessors')
                                   if lps.get('layer') == layer]
    if not ling_processors_layer_nodes:
        naf_header_node.append(LinguisticProcessors(layer, linguistic_processors).node())
        return
    # only the first container found for the layer is modified
    layer_node = ling_processors_layer_nodes[0]
    if replace:
        remove_lps(layer_node)
    # the new processors must be attached to the XML node itself: rebinding the list slot
    # would leave the emptied node without any processor
    layer_node.extend([lp.node() for lp in linguistic_processors])
def get(self, layer_name: str)

Return a layer object for the layer with the given layer-name.

Returns only the first object if more elements carry the same name.

Expand source code
def get(self, layer_name: str):
    """Return a layer object for the layer with the given layer-name.

    Returns only the first object if more elements carry the same name.

    Raises
    ------
    ValueError: if there is no element with that name
    """
    if not self.has_layer(layer_name):
        raise ValueError("layer {} does not exist".format(layer_name))
    search_path = './/{}'.format(layer_name)
    first_node = self.root.findall(search_path)[0]
    return create_from_node[layer_name](first_node)
def get_lps(self, layer_name)

Return list of linguistic processors for a given layer

Parameters

layer_name : str
layer name

Returns

list of Lp objects
 

Raises

ValueError : if the document has no linguisticProcessors element at all; None is returned when linguisticProcessors elements exist but none of them is registered for that layer
 
Expand source code
def get_lps(self, layer_name):
    """Return the linguistic processors recorded for a given layer

    Parameters
    ----------
    layer_name: str
        layer name

    Returns
    -------
    list of LP objects of the first `linguisticProcessors` element registered for that layer, or None if
    no `linguisticProcessors` element is registered for it

    Raises
    ------
    ValueError: if the document has no `linguisticProcessors` element at all (raised by `getall`)
    """
    matching = [lp_layer for lp_layer in self.getall('linguisticProcessors')
                if lp_layer.layer_name == layer_name]
    return matching[0].lps if matching else None
def getall(self, layer_name: str)

Return a list of layer objects for each layer carrying the given layer-name

Expand source code
def getall(self, layer_name: str):
    """Return one layer object per element carrying the given layer-name

    Raises
    ------
    ValueError: if there is no element with that name
    """
    if not self.has_layer(layer_name):
        raise ValueError("layer {} does not exist".format(layer_name))
    build_layer = create_from_node[layer_name]
    matching_nodes = self.root.findall('.//{}'.format(layer_name))
    return list(map(build_layer, matching_nodes))
def has_layer(self, layer: str)

Returns True if layer with given name exists

Expand source code
def has_layer(self, layer: str):
    """Returns True if layer with given name exists

    The layer is searched at any depth below the root element.

    Parameters
    ----------
    layer : str
        layer name

    Returns
    -------
    True if at least one element carries that name, False otherwise
    """
    # `find` stops at the first match; the test yields a real boolean instead of the list of matches
    return self.root.find('.//{}'.format(layer)) is not None
def reset_targets2indices(self)

Recomputes the mapping of all word forms, subtokens and terms to their start and end indices.

This mapping is computed in a restricted number of cases: when loading an existing NAF document, or when retrieving the covered text on a newly created NAF document. The present function can be called when adding layers for which the mapping will be relevant, such as subtokens or terms on a NAF document already annotated with word forms.

Expand source code
def reset_targets2indices(self):
    """Recompute the mapping of all word forms, subtokens and terms to their start and end indices.

    This mapping is computed in a restricted number of cases: when loading an existing NAF document, or when
    retrieving the covered text on a newly created NAF document. Call this function after adding layers for
    which the mapping is relevant, such as subtokens or terms on a NAF document already annotated with
    word forms.
    """
    refreshed = self.targets2indices()
    self.id_map = refreshed
def start_end_indices(self, target_ids: List[str]) -> Tuple[int, int]

Return the start and end indices of the span represented by the target ids

Parameters

target_ids : List[str]
target ids

Returns

tuple of start and end indices
 
Expand source code
def start_end_indices(self, target_ids: List[str]) -> Tuple[int, int]:
    """Return the start and end indices of the span delimited by the target ids

    The start index is the one of the first target id, the end index the one of the last target id.

    Parameters
    ----------
    target_ids: List[str]
        target ids

    Returns
    -------
    tuple of start and end indices

    Raises
    ------
    ValueError: if the id mapping is empty and cannot be computed
    """
    if not self.id_map:
        # the mapping is built lazily, on first use
        self.id_map = self.targets2indices()
        if not self.id_map:
            raise ValueError('No target ids found')
    first_id, last_id = target_ids[0], target_ids[-1]
    span_start = self.id_map[first_id][0]
    span_end = self.id_map[last_id][1]
    return span_start, span_end
def targets2indices(self) -> Dict[str, Tuple[int, int]]

Map each word form, subtoken or term id to its begin and end indices

Returns

map of target ids to start and end indices
 
Expand source code
def targets2indices(self) -> Dict[str, Tuple[int, int]]:
    """Map each word form, subtoken or term id to its begin and end indices

    Word forms and subtokens are mapped through their own offset and length. A term is mapped to the start
    index of the first target of its span and the end index of the last one, so that multi-word terms cover
    all their word forms.

    Returns
    -------
    map of target ids to start and end indices; empty if the document has no text layer
    """
    if not self.has_layer('text'):
        return {}
    id_map = {}
    for wf in self.get('text'):
        wf_start = int(wf.offset)
        id_map[wf.id] = (wf_start, wf_start + int(wf.length))
        if wf.subtokens:
            for st in wf.subtokens:
                st_start = int(st.offset)
                id_map[st.id] = (st_start, st_start + int(st.length))
    if self.has_layer('terms'):     # higher layers may reference to terms
        # map term ids to begin/end indices through the ids of their first and last targets
        term_map = {}
        for term in self.get('terms'):
            tids = term.span.target_ids()
            term_map[term.id] = (id_map[tids[0]][0], id_map[tids[-1]][1])
        id_map.update(term_map)
    return id_map
def write(self, file_path)

Write NAF tree to file, or to stdout if file_path is None

Expand source code
def write(self, file_path):
    """Write NAF tree to file, or to stdout if `file_path` is None

    Parameters
    ----------
    file_path : str or None
        path of the output file; pass None to print the serialized document to stdout
    """
    if file_path is None:
        # `tostring` returns bytes when an encoding is given: decode them so that the XML itself is
        # printed rather than the repr of a bytes object
        serialized = etree.tostring(self.root, encoding='UTF-8', pretty_print=True, xml_declaration=True)
        print(serialized.decode('UTF-8'))
    else:
        self.tree.write(file_path, encoding='UTF-8', pretty_print=True, xml_declaration=True)