Source code for sentency.sentency

import re

from spacy.language import Language
from spacy.tokens import Doc, Span

from .logs import get_logger

logger = get_logger(__name__)


@Language.factory(
    "sentex",
    default_config={
        "sentence_regex": "",
        "ignore_regex": "",
        "annotate_ents": False,
        "label": "Sentex",
    },
)
def create_sentex_component(
    nlp: Language,
    name: str,
    sentence_regex: str,
    ignore_regex: str,
    annotate_ents: bool,
    label: str,
):
    """Factory registered with spaCy under the name ``"sentex"``.

    Builds a `Sentex` component from the pipeline config. `name` is part of
    the signature spaCy passes to factories and is not used here; every other
    argument is forwarded unchanged to the `Sentex` constructor.
    """
    return Sentex(
        nlp=nlp,
        sentence_regex=sentence_regex,
        ignore_regex=ignore_regex,
        annotate_ents=annotate_ents,
        label=label,
    )


class Sentex:
    """
    Sentex is a spaCy pipeline component that adds spans to the list `Doc._.sentex`
    based on regular expression matches within each sentence of the document. If an
    `ignore_regex` is given, sentences matching that regular expression will be ignored.
    Zero-width matches (which is all an empty `sentence_regex` can produce) are
    skipped, because they do not cover any text.

    nlp: `Language`,
        A required argument for spacy to use this as a factory
    sentence_regex : `str`,
        A regular expression to match spans within each sentence of the document.
    ignore_regex : `str`,
        A regular expression to identify sentences that should be ignored.
        A blank value disables ignoring.
    annotate_ents: `bool`,
        Write/overwrite matches to Doc.ents
    label: `str`,
        If annotate_ents == True, the label for the matched entity
    """

    def __init__(
        self,
        nlp: Language,
        sentence_regex: str,
        ignore_regex: str,
        annotate_ents: bool,
        label: str,
    ):
        # `nlp` is accepted for the factory signature only; it is not stored.
        self.sentence_regex = sentence_regex
        self.ignore_regex = ignore_regex
        self.annotate_ents = annotate_ents
        self.label = label

        # Register the custom attribute once; later instances reuse it.
        if not Doc.has_extension("sentex"):
            Doc.set_extension("sentex", default=[])

    def __call__(self, doc: Doc) -> Doc:
        """Append every regex match found in `doc` to `doc._.sentex`.

        Matches are located per sentence, widened to whole tokens, and stored
        as spans of `doc`. When `annotate_ents` is set, `doc.ents` is updated
        from the collected spans afterwards. Returns the same `doc`.
        """
        # Compile once per call instead of looking the patterns up per sentence.
        skip_pattern = (
            re.compile(self.ignore_regex) if self.ignore_regex.strip() != "" else None
        )
        match_pattern = re.compile(self.sentence_regex)

        # keep track of previous sentence
        prev_sent = None
        for i, sent in enumerate(doc.sents):
            logger.debug(f"sentence {i}: {sent}")
            if skip_pattern is not None and skip_pattern.search(sent.text):
                logger.debug("sentence ignored")
                prev_sent = sent
                continue

            # Built lazily and at most once per sentence: as_doc() copies the
            # sentence, so there is no point doing it for sentences without
            # matches or repeating it for every match.
            sent_doc = None
            for match in match_pattern.finditer(sent.text):
                logger.debug(f"match: {str(match)}")
                start, end = match.span()
                if start == end:
                    # An empty match covers no characters; aligning it to
                    # tokens would record text the pattern never matched.
                    logger.debug("zero-width match skipped")
                    continue
                if sent_doc is None:
                    sent_doc = sent.as_doc()
                # convert to doc in order to use 'expand' alignment mode
                # in case indices are inside token boundaries
                span = sent_doc.char_span(start, end, alignment_mode="expand")
                if span is None:
                    logger.debug("span is None")
                    continue
                # realign span so start/end are relative to doc, not sent
                span = self._realign_span(doc, span, prev_sent)
                logger.debug(f"match {span.text} start: {start} end: {end}")
                logger.debug(
                    f"start {span.start} end {span.end}\
                        start char {span.start_char} end char {span.end_char}"
                )
                logger.debug("adding match to sentex")
                doc._.sentex.append(span)
            prev_sent = sent

        if self.annotate_ents:
            self.set_annotations(doc)
        return doc

    def set_annotations(self, doc):
        """Modify the document in place.
        Logic taken from spacy.pipeline.entityruler.EntityRuler

        Each span in `doc._.sentex` becomes an entity labelled `self.label`,
        unless its first or last token is already covered by an entity added
        earlier in this call. Existing entities that overlap a newly added one
        are dropped; all other existing entities are kept.
        """
        kept = list(doc.ents)
        logger.debug(f"current entities: {kept}")
        added = []
        claimed_tokens = set()
        for match_id, start, end in self._get_matches(doc):
            # check for end - 1 here because boundaries are inclusive
            if start in claimed_tokens or end - 1 in claimed_tokens:
                continue
            span = Span(doc, start, end, label=match_id)
            added.append(span)
            logger.debug(f"new entity: {span.text} start: {start} end: {end}")
            # Drop previously known entities that intersect [start, end).
            kept = [e for e in kept if e.end <= start or e.start >= end]
            claimed_tokens.update(range(start, end))
        doc.ents = kept + added

    def _get_matches(self, doc: Doc):
        """Return `(label, start_token, end_token)` for each span in `doc._.sentex`."""
        return [(self.label, m.start, m.end) for m in doc._.sentex]

    def _realign_span(self, doc: Doc, span: Span, prev_sent: Span):
        """Translate a span of a sentence-level doc into a span of `doc`.

        `prev_sent` is the sentence preceding the one `span` came from, or
        None for the first sentence. Its end token index is used as the offset
        of the current sentence, which assumes sentences are contiguous.
        """
        offset = prev_sent.end if prev_sent is not None else 0
        return doc[span.start + offset : span.end + offset]
    
@Language.factory(
    "size",
    default_config={
        # NOTE(review): the leading `(i?)` matches an optional literal "i";
        # it looks like a mistyped `(?i)` flag — confirm intent before changing.
        "size_regex": r"(i?)((\d+(\.\d+)?)\s*([Xx]\s*(\d+(\.\d+)?)\s*([Xx]\s*(\d+(\.\d+)?))?)?\s*)(?=\W?[cm]m)",
        "sentex_only": True,
        "annotate_ents": True,
        "label": "SIZE",
    },
)
def create_size_component(
    nlp: Language,
    name: str,
    size_regex: str,
    sentex_only: bool,
    annotate_ents: bool,
    label: str,
):
    """Factory registered with spaCy under the name ``"size"``.

    Builds a `Size` component from the pipeline config. `name` is part of the
    signature spaCy passes to factories and is not used here; every other
    argument is forwarded unchanged to the `Size` constructor.
    """
    return Size(
        nlp=nlp,
        size_regex=size_regex,
        sentex_only=sentex_only,
        annotate_ents=annotate_ents,
        label=label,
    )


class Size:
    """
    Size is a spaCy pipeline component that adds `(span, size)` tuples to the
    list `Doc._.size` based on regular expression matches within each sentence
    of the document. At most one match (the first) is recorded per sentence.

    nlp: `Language`,
        A required argument for spacy to use this as a factory
    size_regex : `str`,
        A regular expression to match spans within each sentence of the document.
    sentex_only : `bool`,
        Only match in sentences with Sentex-matched entities
        (sentences containing a span listed in `Doc._.sentex`)
    annotate_ents: `bool`,
        Write/overwrite matches to Doc.ents
    label: `str`,
        If annotate_ents == True, the label for the matched entity
    """

    def __init__(
        self,
        nlp: Language,
        size_regex: str,
        sentex_only: bool,
        annotate_ents: bool,
        label: str,
    ):
        # `nlp` is accepted for the factory signature only; it is not stored.
        self.size_regex = size_regex
        self.sentex_only = sentex_only
        self.annotate_ents = annotate_ents
        self.label = label

        # Register the custom attribute once; later instances reuse it.
        if not Doc.has_extension("size"):
            Doc.set_extension("size", default=[])

    def __call__(self, doc: Doc) -> Doc:
        """Collect size matches into `doc._.size`, optionally update `doc.ents`,
        and return the same `doc`."""
        self.process(doc)
        if self.annotate_ents:
            self.set_annotations(doc)
        return doc

    def set_annotations(self, doc):
        """Modify the document in place.
        Logic taken from spacy.pipeline.entityruler.EntityRuler

        Each span recorded in `doc._.size` becomes an entity labelled
        `self.label`, unless its first or last token is already covered by an
        entity added earlier in this call. Existing entities that overlap a
        newly added one are dropped; all other existing entities are kept.
        """
        kept = list(doc.ents)
        logger.debug(f"current entities: {kept}")
        added = []
        claimed_tokens = set()
        for match_id, start, end in self._get_matches(doc):
            # check for end - 1 here because boundaries are inclusive
            if start in claimed_tokens or end - 1 in claimed_tokens:
                continue
            span = Span(doc, start, end, label=match_id)
            added.append(span)
            logger.debug(f"new entity: {span.text} start: {start} end: {end}")
            # Drop previously known entities that intersect [start, end).
            kept = [e for e in kept if e.end <= start or e.start >= end]
            claimed_tokens.update(range(start, end))
        doc.ents = kept + added

    def process(self, doc: Doc) -> Doc:
        """Search sentences for `size_regex` and record hits in `doc._.size`.

        With `sentex_only`, only sentences containing a span from
        `doc._.sentex` are searched, each once and in document order;
        otherwise every sentence is searched. Matches that cannot be aligned
        to whole tokens, or whose text cannot be parsed into a number, are
        logged and skipped. Returns the same `doc`.
        """
        if self.sentex_only:
            # De-duplicate by token bounds and sort, so the order of
            # doc._.size does not depend on set iteration order.
            by_bounds = {}
            for hit in doc._.sentex:
                owner = hit.sent
                by_bounds.setdefault((owner.start, owner.end), owner)
            sents = [by_bounds[bounds] for bounds in sorted(by_bounds)]
        else:
            sents = doc.sents

        for sent in sents:
            match = re.search(self.size_regex, sent.text)
            if match is None:
                continue
            start, end = match.span()
            # Regex offsets are relative to the sentence; shift to doc offsets.
            num_start = sent.start_char + start
            num_end = sent.start_char + end

            span = sent.doc.char_span(num_start, num_end, alignment_mode="contract")
            logger.debug(f'span: {span} indexes: {num_start} to {num_end}')
            if span is None:
                # 'contract' yields None when no token lies fully inside the
                # matched characters; there is nothing to record then.
                continue
            try:
                size = self.parse_size(span.text)
            except ValueError:
                # Do not abort the pipeline over one unparseable match.
                logger.debug(f"could not parse size from: {span.text}")
                continue
            doc._.size.append((span, size))
        return doc

    def _get_matches(self, doc: Doc):
        """Return `(label, start_token, end_token)` for each entry of `doc._.size`."""
        return [(self.label, m[0].start, m[0].end) for m in doc._.size]

    def parse_size(self, text: str) -> float:
        """Return the larger of the first two numbers in a string like "3 x 4.5".

        Numbers are separated by "x"/"X" and/or whitespace; any values after
        the second are ignored. Raises ValueError when `text` contains no
        number or a fragment that is not a valid float.
        """
        # split() with no argument handles tabs/newlines and drops empty
        # fragments, so whitespace-only pieces never reach float().
        numbers = [float(part) for part in text.lower().replace("x", " ").split()]
        if not numbers:
            raise ValueError(f"no numeric value found in {text!r}")
        return max(numbers[:2])
Source code for sentency.sentency

sentency

Navigation

Related Topics